Load the required libraries

PRS****************************************************************************************************************

Input the PRS data

# Read the tab-delimited PRS (polygenic risk score) table; one row per
# individual, one column per trait score plus metadata columns
# (arrest_ontology, Set) used below.
# NOTE(review): hard-coded absolute user path — only portable on the
# author's machine; consider a relative path or a configurable variable.
prs_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/PRS_array.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)

Set color palette

# Two-group fill/color palette used in the density and CDF plots below.
# Presumably grey = Control and green (#3C8C78) = Case, since ggplot maps
# values in factor-level order — TODO confirm against the rendered figures.
group_colors  <- c("grey","#3C8C78") 

Define your list of traits to analyze

# List of traits to analyze — each must match a column name in prs_data;
# z-normalized copies are created below as Normalized_<trait>.
traits <- c("Heart_rate", "LQTS", "PR_interval", "QTc", "Afib", "HRV", "BP", "QRS", "HCM", "LVEDV","LVEDVI","LVEF","LVESV","LVESVI","SV","SVI","GGE","Focal_Epilepsy")
traits
##  [1] "Heart_rate"     "LQTS"           "PR_interval"    "QTc"           
##  [5] "Afib"           "HRV"            "BP"             "QRS"           
##  [9] "HCM"            "LVEDV"          "LVEDVI"         "LVEF"          
## [13] "LVESV"          "LVESVI"         "SV"             "SVI"           
## [17] "GGE"            "Focal_Epilepsy"

Z normalize the scores

# Z-score each trait column. scale() returns a one-column matrix, so it is
# coerced back to a plain numeric vector before being stored in a new
# Normalized_<trait> column (originals are kept).
prs_data <- prs_data %>%
  mutate(across(all_of(traits), ~ as.numeric(scale(.x)), .names = "Normalized_{.col}"))

Format the data

# Map raw arrest_ontology labels onto display-friendly group names.
# Any label other than "control"/"case" is passed through unchanged
# (as character) rather than being dropped here.
prs_data <- prs_data %>%
  mutate(
    arrest_group = case_when(
      arrest_ontology == "control" ~ "Control",
      arrest_ontology == "case"    ~ "Case",
      TRUE                         ~ as.character(arrest_ontology)
    )
  )

# Keep only the two groups of interest for all downstream comparisons.
filtered_data <- prs_data %>%
  filter(arrest_group %in% c("Control", "Case"))

# Build the expected normalized column names from the trait list, then keep
# only those actually present on the data (guards against missing traits).
normalized_traits <- paste0("Normalized_", traits)
existing_normalized_traits <- intersect(normalized_traits, names(filtered_data))

# Long format (one row per sample x normalized trait) for plotting/testing.
melted_data_normalized <- reshape2::melt(
  filtered_data,
  id.vars = "arrest_group",
  measure.vars = existing_normalized_traits
)

Split the data into discovery and replication sets

# Partition the case/control data by analysis set.
prs_data_discovery   <- filtered_data %>% filter(Set == "Discovery")
prs_data_replication <- filtered_data %>% filter(Set == "Replication")

Make the Correlation plot

# Select only the normalized trait columns for the correlation analysis.
# Use the verified column list (existing_normalized_traits, computed above)
# rather than the raw normalized_traits vector, so a trait missing from the
# data cannot trigger an undefined-columns subscript error here. When every
# trait exists, the two vectors are identical and behavior is unchanged.
data_for_correlation <- prs_data_discovery[, existing_normalized_traits]

# Fixed display order for the correlation matrix (manually clustered so
# related traits sit next to each other in the heatmap).
corr_traits <- paste0("Normalized_", c(
  "QRS", "BP", "SVI", "LVESV", "PR_interval", "HRV", "GGE", "LVEDV",
  "LVESVI", "Focal_Epilepsy", "HCM", "LQTS", "Afib", "LVEDVI", "SV",
  "Heart_rate", "QTc", "LVEF"
))

# Reorder columns to the preferred display order.
data_for_correlation_ordered <- data_for_correlation[, corr_traits]

# Pairwise Pearson correlations over complete cases only.
cor_matrix_ordered <- cor(data_for_correlation_ordered, use = "complete.obs")

# Draw the full correlation heatmap with a diverging blue-white-gold palette.
heat_palette <- colorRampPalette(c("#05618F", "white", "#F0BE3C"))(200)
corrplot(cor_matrix_ordered,
         method      = "color",
         type        = "full",
         col         = heat_palette,
         tl.col      = "black",
         tl.srt      = 45,
         cl.ratio    = 0.2,
         diag        = FALSE,
         addgrid.col = "black")

# Pearson correlation p-values for every trait pair (Hmisc::rcorr).
correlation_results <- rcorr(as.matrix(data_for_correlation_ordered), type = "pearson")
correlation_results$P
##                           Normalized_QRS Normalized_BP Normalized_SVI
## Normalized_QRS                        NA  9.933678e-01   6.853895e-01
## Normalized_BP               9.933678e-01            NA   1.671754e-06
## Normalized_SVI              6.853895e-01  1.671754e-06             NA
## Normalized_LVESV            4.734779e-05  1.440789e-01   5.176039e-08
## Normalized_PR_interval      6.161903e-03  1.510868e-01   3.070402e-01
## Normalized_HRV              5.899446e-06  3.828959e-01   7.641050e-07
## Normalized_GGE              9.659843e-02  1.270667e-01   7.128341e-03
## Normalized_LVEDV            4.954003e-02  1.918542e-01   4.352831e-03
## Normalized_LVESVI           6.277110e-04  5.726864e-01   5.992465e-01
## Normalized_Focal_Epilepsy   9.955037e-02  3.021332e-01   2.018146e-01
## Normalized_HCM              2.396194e-02  7.604039e-01   2.522892e-02
## Normalized_LQTS             5.021915e-02  2.378957e-01   3.264184e-02
## Normalized_Afib             2.837904e-01  7.458152e-01   7.488965e-04
## Normalized_LVEDVI           4.291581e-02  2.605060e-03   1.172185e-05
## Normalized_SV               7.958341e-01  5.007362e-03   1.326023e-03
## Normalized_Heart_rate       7.704693e-04  7.404989e-02   0.000000e+00
## Normalized_QTc              3.611700e-01  3.700776e-05   1.148859e-12
## Normalized_LVEF             1.031027e-01  1.295488e-01   7.635401e-06
##                           Normalized_LVESV Normalized_PR_interval
## Normalized_QRS                4.734779e-05            0.006161903
## Normalized_BP                 1.440789e-01            0.151086783
## Normalized_SVI                5.176039e-08            0.307040155
## Normalized_LVESV                        NA            0.279786899
## Normalized_PR_interval        2.797869e-01                     NA
## Normalized_HRV                1.727377e-03            0.183954907
## Normalized_GGE                2.297310e-01            0.926572064
## Normalized_LVEDV              0.000000e+00            0.326133747
## Normalized_LVESVI             0.000000e+00            0.977482753
## Normalized_Focal_Epilepsy     3.413192e-01            0.873919485
## Normalized_HCM                0.000000e+00            0.779477674
## Normalized_LQTS               4.128755e-02            0.130976701
## Normalized_Afib               5.958125e-04            0.092859152
## Normalized_LVEDVI             0.000000e+00            0.837999134
## Normalized_SV                 6.087284e-09            0.126695164
## Normalized_Heart_rate         9.201374e-03            0.151032458
## Normalized_QTc                6.331019e-01            0.389631858
## Normalized_LVEF               0.000000e+00            0.257495908
##                           Normalized_HRV Normalized_GGE Normalized_LVEDV
## Normalized_QRS              5.899446e-06   9.659843e-02     4.954003e-02
## Normalized_BP               3.828959e-01   1.270667e-01     1.918542e-01
## Normalized_SVI              7.641050e-07   7.128341e-03     4.352831e-03
## Normalized_LVESV            1.727377e-03   2.297310e-01     0.000000e+00
## Normalized_PR_interval      1.839549e-01   9.265721e-01     3.261337e-01
## Normalized_HRV                        NA   1.417508e-05     7.232691e-01
## Normalized_GGE              1.417508e-05             NA     4.327716e-01
## Normalized_LVEDV            7.232691e-01   4.327716e-01               NA
## Normalized_LVESVI           4.718379e-01   3.926784e-01     0.000000e+00
## Normalized_Focal_Epilepsy   2.515402e-01   9.696364e-01     1.792096e-01
## Normalized_HCM              2.268539e-01   7.662938e-02     9.463941e-11
## Normalized_LQTS             9.602938e-01   6.911754e-01     5.693882e-05
## Normalized_Afib             6.384594e-01   5.193028e-02     4.874951e-01
## Normalized_LVEDVI           3.163305e-01   7.224922e-02     0.000000e+00
## Normalized_SV               5.701852e-01   3.181255e-01     0.000000e+00
## Normalized_Heart_rate       1.551204e-12   1.646706e-01     1.215043e-03
## Normalized_QTc              6.050812e-03   9.795078e-03     9.106381e-03
## Normalized_LVEF             1.320877e-02   3.297949e-01     1.024596e-07
##                           Normalized_LVESVI Normalized_Focal_Epilepsy
## Normalized_QRS                  0.000627711                0.09955037
## Normalized_BP                   0.572686369                0.30213319
## Normalized_SVI                  0.599246487                0.20181462
## Normalized_LVESV                0.000000000                0.34131916
## Normalized_PR_interval          0.977482753                0.87391948
## Normalized_HRV                  0.471837901                0.25154018
## Normalized_GGE                  0.392678433                0.96963636
## Normalized_LVEDV                0.000000000                0.17920961
## Normalized_LVESVI                        NA                0.30285180
## Normalized_Focal_Epilepsy       0.302851796                        NA
## Normalized_HCM                  0.000000000                0.95432712
## Normalized_LQTS                 0.022630206                0.59571578
## Normalized_Afib                 0.617610997                0.90444909
## Normalized_LVEDVI               0.000000000                0.70452838
## Normalized_SV                   0.016279998                0.71995021
## Normalized_Heart_rate           0.285679168                0.19226455
## Normalized_QTc                  0.071460435                0.97448035
## Normalized_LVEF                 0.000000000                0.87505014
##                           Normalized_HCM Normalized_LQTS Normalized_Afib
## Normalized_QRS              2.396194e-02    5.021915e-02    0.2837904158
## Normalized_BP               7.604039e-01    2.378957e-01    0.7458151593
## Normalized_SVI              2.522892e-02    3.264184e-02    0.0007488965
## Normalized_LVESV            0.000000e+00    4.128755e-02    0.0005958125
## Normalized_PR_interval      7.794777e-01    1.309767e-01    0.0928591521
## Normalized_HRV              2.268539e-01    9.602938e-01    0.6384594227
## Normalized_GGE              7.662938e-02    6.911754e-01    0.0519302805
## Normalized_LVEDV            9.463941e-11    5.693882e-05    0.4874950835
## Normalized_LVESVI           0.000000e+00    2.263021e-02    0.6176109965
## Normalized_Focal_Epilepsy   9.543271e-01    5.957158e-01    0.9044490887
## Normalized_HCM                        NA    2.747912e-01    0.2051656313
## Normalized_LQTS             2.747912e-01              NA    0.6090492099
## Normalized_Afib             2.051656e-01    6.090492e-01              NA
## Normalized_LVEDVI           3.149592e-06    5.104404e-06    0.8024317643
## Normalized_SV               5.644828e-01    2.725126e-06    0.7848328230
## Normalized_Heart_rate       7.267055e-01    2.228441e-01    0.0174432137
## Normalized_QTc              9.979204e-01    0.000000e+00    0.1904777957
## Normalized_LVEF             0.000000e+00    5.014860e-01    0.0219999301
##                           Normalized_LVEDVI Normalized_SV Normalized_Heart_rate
## Normalized_QRS                 4.291581e-02  7.958341e-01          7.704693e-04
## Normalized_BP                  2.605060e-03  5.007362e-03          7.404989e-02
## Normalized_SVI                 1.172185e-05  1.326023e-03          0.000000e+00
## Normalized_LVESV               0.000000e+00  6.087284e-09          9.201374e-03
## Normalized_PR_interval         8.379991e-01  1.266952e-01          1.510325e-01
## Normalized_HRV                 3.163305e-01  5.701852e-01          1.551204e-12
## Normalized_GGE                 7.224922e-02  3.181255e-01          1.646706e-01
## Normalized_LVEDV               0.000000e+00  0.000000e+00          1.215043e-03
## Normalized_LVESVI              0.000000e+00  1.628000e-02          2.856792e-01
## Normalized_Focal_Epilepsy      7.045284e-01  7.199502e-01          1.922646e-01
## Normalized_HCM                 3.149592e-06  5.644828e-01          7.267055e-01
## Normalized_LQTS                5.104404e-06  2.725126e-06          2.228441e-01
## Normalized_Afib                8.024318e-01  7.848328e-01          1.744321e-02
## Normalized_LVEDVI                        NA  0.000000e+00          2.187902e-01
## Normalized_SV                  0.000000e+00            NA          1.209219e-02
## Normalized_Heart_rate          2.187902e-01  1.209219e-02                    NA
## Normalized_QTc                 1.142112e-03  4.917063e-06          2.184145e-02
## Normalized_LVEF                3.606922e-03  1.979529e-02          1.680620e-02
##                           Normalized_QTc Normalized_LVEF
## Normalized_QRS              3.611700e-01    1.031027e-01
## Normalized_BP               3.700776e-05    1.295488e-01
## Normalized_SVI              1.148859e-12    7.635401e-06
## Normalized_LVESV            6.331019e-01    0.000000e+00
## Normalized_PR_interval      3.896319e-01    2.574959e-01
## Normalized_HRV              6.050812e-03    1.320877e-02
## Normalized_GGE              9.795078e-03    3.297949e-01
## Normalized_LVEDV            9.106381e-03    1.024596e-07
## Normalized_LVESVI           7.146043e-02    0.000000e+00
## Normalized_Focal_Epilepsy   9.744803e-01    8.750501e-01
## Normalized_HCM              9.979204e-01    0.000000e+00
## Normalized_LQTS             0.000000e+00    5.014860e-01
## Normalized_Afib             1.904778e-01    2.199993e-02
## Normalized_LVEDVI           1.142112e-03    3.606922e-03
## Normalized_SV               4.917063e-06    1.979529e-02
## Normalized_Heart_rate       2.184145e-02    1.680620e-02
## Normalized_QTc                        NA    8.490931e-02
## Normalized_LVEF             8.490931e-02              NA

Test for ANOVA assumptions

# Check ANOVA assumptions for one trait: normality of the model residuals
# (Shapiro-Wilk) and homogeneity of variance across arrest groups
# (Levene's test from the car package, plus Bartlett's test).
#
# Args:
#   data:  data frame containing `arrest_group` and the trait column.
#   trait: character name of the (normalized) trait column to test.
#
# Returns a one-row data frame with the trait name and the three p-values.
check_anova_assumptions <- function(data, trait) {
  # Ensure 'arrest_group' is a factor so aov()/leveneTest() treat it as groups
  data$arrest_group <- as.factor(data$arrest_group)

  # Fit the one-way ANOVA model trait ~ arrest_group.
  # Local names chosen to avoid shadowing stats::formula and stats::residuals.
  model_formula <- as.formula(paste(trait, "~ arrest_group"))
  anova_model <- aov(model_formula, data = data)
  model_residuals <- residuals(anova_model)

  # Shapiro-Wilk test for normality of the residuals
  shapiro_p_value <- shapiro.test(model_residuals)$p.value

  # Levene's test for homogeneity of variances (robust to non-normality)
  levene_p_value <- leveneTest(model_formula, data = data)$`Pr(>F)`[1]

  # Bartlett's test for homogeneity of variances (sensitive to non-normality)
  bartlett_p_value <- bartlett.test(model_formula, data = data)$p.value

  # One-row summary of the assumption checks
  data.frame(
    Trait = trait,
    Shapiro_Wilk_p_value = shapiro_p_value,
    Levene_p_value = levene_p_value,
    Bartlett_p_value = bartlett_p_value
  )
}

# Run the assumption checks for every normalized trait and stack the
# one-row results into a single summary table.
anova_assumptions_results <- lapply(normalized_traits, function(tr) {
  check_anova_assumptions(prs_data_discovery, tr)
})

anova_assumptions_df <- do.call(rbind, anova_assumptions_results)

# Show the per-trait assumption-test p-values
print(anova_assumptions_df)
##                        Trait Shapiro_Wilk_p_value Levene_p_value
## 1      Normalized_Heart_rate         3.670786e-05   3.402430e-01
## 2            Normalized_LQTS         8.582805e-01   8.524520e-01
## 3     Normalized_PR_interval         9.626637e-02   7.067574e-01
## 4             Normalized_QTc         1.378815e-07   4.084220e-01
## 5            Normalized_Afib         5.252293e-01   4.766202e-01
## 6             Normalized_HRV         2.678043e-13   1.221016e-01
## 7              Normalized_BP         2.145452e-02   8.637621e-01
## 8             Normalized_QRS         1.508077e-39   2.042997e-06
## 9             Normalized_HCM         3.144379e-03   9.212937e-01
## 10          Normalized_LVEDV         1.331310e-01   2.218475e-04
## 11         Normalized_LVEDVI         5.919657e-02   7.738241e-03
## 12           Normalized_LVEF         2.471126e-02   3.111787e-01
## 13          Normalized_LVESV         1.169793e-02   9.853473e-01
## 14         Normalized_LVESVI         7.239432e-04   5.086718e-01
## 15             Normalized_SV         1.118507e-05   5.214914e-01
## 16            Normalized_SVI         8.610508e-04   8.519458e-01
## 17            Normalized_GGE         4.448074e-02   6.815913e-01
## 18 Normalized_Focal_Epilepsy         3.025018e-03   6.100802e-01
##    Bartlett_p_value
## 1      6.287623e-02
## 2      7.465532e-01
## 3      8.498751e-01
## 4      5.892923e-01
## 5      5.905392e-01
## 6      6.549201e-02
## 7      8.042278e-01
## 8      1.040405e-24
## 9      8.033151e-01
## 10     9.995313e-04
## 11     1.890503e-02
## 12     4.997414e-01
## 13     8.378271e-01
## 14     2.967646e-01
## 15     6.599025e-01
## 16     6.489996e-01
## 17     5.545373e-01
## 18     4.610240e-01

Since some traits do not meet the ANOVA assumptions, we use nonparametric tests instead

# Wilcoxon rank-sum test of one trait between arrest groups, plus the
# difference in group medians.
#
# Args:
#   data:    data frame with `arrest_group` and the trait column.
#   trait:   character name of the trait column to test.
#   n_tests: number of comparisons used for the Bonferroni correction.
#            Defaults to 18 (the number of traits analyzed in this script),
#            so the multiplicity is a documented parameter rather than a
#            magic number buried in the body.
#
# Returns a one-row data frame: trait, raw and adjusted p-values, a
# significance flag (adjusted p < 0.05), and the median difference.
perform_wilcoxon_test <- function(data, trait, n_tests = 18) {
  # Keep only the group label and the trait, dropping incomplete rows
  data <- data %>%
    dplyr::select(arrest_group, all_of(trait)) %>%
    drop_na()

  # Per-group medians. NOTE(review): the sign of diff_median depends on the
  # row order produced by group_by() (alphabetical: Case before Control),
  # so positive values mean Control > Case — confirm against the figures.
  group_medians <- data %>%
    group_by(arrest_group) %>%
    summarise(median_value = median(!!sym(trait)))

  diff_median <- diff(group_medians$median_value)

  # Wilcoxon rank-sum test (rstatix::wilcox_test)
  wilcoxon_test_result <- wilcox_test(data, as.formula(paste(trait, "~ arrest_group")))
  p_value <- wilcoxon_test_result$p

  # Bonferroni correction for the family of n_tests comparisons
  p_adjusted <- p.adjust(p_value, method = "bonferroni", n = n_tests)

  result_df <- data.frame(
    Trait = trait,
    p_value = p_value,
    p_adjusted = p_adjusted,
    significant = p_adjusted < 0.05,
    diff_median = diff_median
  )

  return(result_df)
}

# Apply the Wilcoxon test to every normalized trait and stack the rows.
wilcoxon_results <- lapply(normalized_traits, function(tr) {
  perform_wilcoxon_test(prs_data_discovery, tr)
})

combined_wilcoxon_results <- do.call(rbind, wilcoxon_results)

# Report raw and Bonferroni-adjusted p-values with median differences
print("Wilcoxon Rank Sum Test Results with Manual P-value Adjustments and Median Differences:")
## [1] "Wilcoxon Rank Sum Test Results with Manual P-value Adjustments and Median Differences:"
print(combined_wilcoxon_results)
##                        Trait  p_value p_adjusted significant  diff_median
## 1      Normalized_Heart_rate 1.24e-02 2.2320e-01       FALSE  0.088165794
## 2            Normalized_LQTS 1.91e-01 1.0000e+00       FALSE  0.112902629
## 3     Normalized_PR_interval 2.34e-02 4.2120e-01       FALSE -0.160790733
## 4             Normalized_QTc 4.67e-03 8.4060e-02       FALSE  0.179095083
## 5            Normalized_Afib 1.07e-01 1.0000e+00       FALSE  0.080017842
## 6             Normalized_HRV 1.51e-01 1.0000e+00       FALSE -0.089874654
## 7              Normalized_BP 1.21e-06 2.1780e-05        TRUE -0.275654610
## 8             Normalized_QRS 1.12e-08 2.0160e-07        TRUE -0.128954882
## 9             Normalized_HCM 4.76e-01 1.0000e+00       FALSE  0.031802432
## 10          Normalized_LVEDV 6.38e-01 1.0000e+00       FALSE -0.044351499
## 11         Normalized_LVEDVI 7.37e-02 1.0000e+00       FALSE  0.118958269
## 12           Normalized_LVEF 1.76e-03 3.1680e-02        TRUE  0.124406676
## 13          Normalized_LVESV 7.02e-03 1.2636e-01       FALSE -0.124673369
## 14         Normalized_LVESVI 9.49e-01 1.0000e+00       FALSE -0.011000752
## 15             Normalized_SV 3.55e-02 6.3900e-01       FALSE  0.146894687
## 16            Normalized_SVI 1.77e-04 3.1860e-03        TRUE -0.360642163
## 17            Normalized_GGE 3.20e-01 1.0000e+00       FALSE -0.061833861
## 18 Normalized_Focal_Epilepsy 9.01e-01 1.0000e+00       FALSE  0.001265513

Make the lollipop plot

# Classify each trait as a quantitative metric or a disease syndrome.
metrics <- paste0("Normalized_", c(
  "Heart_rate", "PR_interval", "QTc", "HRV", "BP", "QRS",
  "LVEDV", "LVEDVI", "LVEF", "LVESV", "LVESVI", "SV", "SVI"
))
syndromes <- paste0("Normalized_", c("LQTS", "Afib", "HCM", "GGE", "Focal_Epilepsy"))

# Label each row; anything in neither list falls back to "Combined"
combined_wilcoxon_results$Category <- ifelse(
  combined_wilcoxon_results$Trait %in% metrics, "Metrics",
  ifelse(combined_wilcoxon_results$Trait %in% syndromes, "Syndromes", "Combined")
)

# Order trait levels by category (superseded by the explicit order below)
combined_wilcoxon_results <- combined_wilcoxon_results %>%
    mutate(Trait = factor(Trait, levels = unique(Trait[order(Category)])))

# -log10-transform the raw p-values for plotting
combined_wilcoxon_results$NegLogP <- -log10(combined_wilcoxon_results$p_value)

# Reference lines: nominal 0.05 and Bonferroni-corrected 0.05/18
threshold <- -log10(0.05)
threshold2 <- -log10(0.05/18)

# Flag nominal (uncorrected) significance for optional coloring
combined_wilcoxon_results$Significant <- ifelse(
  combined_wilcoxon_results$p_value < 0.05, "Significant", "Not Significant"
)

# Explicit left-to-right display order for the lollipop plot
ordered_traits <- paste0("Normalized_", c(
  "QRS", "BP", "SVI", "LVESV", "PR_interval", "HRV", "GGE", "LVEDV",
  "LVESVI", "Focal_Epilepsy", "HCM", "LQTS", "Afib", "LVEDVI", "SV",
  "Heart_rate", "QTc", "LVEF"
))

# Re-level 'Trait' to this specific order (overrides the category ordering)
combined_wilcoxon_results$Trait <- factor(combined_wilcoxon_results$Trait, levels = ordered_traits)

# Lollipop plot of -log10(p) per trait, colored by the median difference.
# Fix: the `size` aesthetic for lines was deprecated in ggplot2 3.4.0 (the
# warning appeared in earlier renders of this document); geom_segment now
# uses `linewidth` instead.
p3 <- ggplot(combined_wilcoxon_results, aes(x = Trait, y = NegLogP, color = diff_median)) +
    geom_segment(aes(xend = Trait, yend = 0), linewidth = 1) +
    geom_point(size = 3) +
    # dotted black line: nominal p = 0.05; dotted red line: Bonferroni 0.05/18
    geom_hline(yintercept = threshold, linetype = "dotted", color = "Black") +
    geom_hline(yintercept = threshold2, linetype = "dotted", color = "Red") +
    scale_color_gradient2(low = "#3C8C78", mid = "white", high = "black", midpoint = 0) +
    theme_cowplot() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          strip.text.x = element_text(size = 10)) +
    labs(y = "-log10(P-value)", x = "Trait", color = "Median Difference", title = "Lollipop Plot of -log10(P-values) by Trait")
# Display the plot
print(p3)

# Build a two-panel figure (rank density + rank ECDF) for one trait,
# comparing the arrest groups.
#
# Args:
#   data:         data frame with `arrest_group` and the raw trait column.
#   trait:        character name of the trait column to rank and plot.
#   group_colors: colors mapped to the arrest groups (factor-level order).
#
# Returns a cowplot grid: density panel (A) stacked above the CDF panel (B).
generate_plots_for_trait <- function(data, trait, group_colors) {
  # Rank the scores for the specified trait (NAs keep their positions)
  data$rank <- rank(data[[trait]], na.last = "keep")

  # Density of the trait ranks, per arrest group
  density_plot <- ggplot(data, aes(x = rank, fill = arrest_group, y = after_stat(density))) +
    geom_density(alpha = 0.6) +
    scale_fill_manual(values = group_colors) +
    labs(title = paste("Relative Density of Overall", trait, "Trait Ranks Across Arrest Groups"),
         x = paste("Overall Rank of", trait, "Scores"),
         y = "Relative Density") +
    theme_cowplot(12) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Empirical CDF of the ranks. Fix: `linewidth` replaces the `size` line
  # aesthetic deprecated in ggplot2 3.4.0.
  cdf_plot <- ggplot(data, aes(x = rank, color = arrest_group)) +
    stat_ecdf(geom = "step", linewidth = 1) +
    scale_color_manual(values = group_colors) +
    labs(title = paste("CDF of Overall", trait, "Trait Ranks Across Arrest Groups"),
         x = paste("Overall Rank of", trait, "Scores"),
         y = "Cumulative Proportion") +
    theme_cowplot(12) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Stack the two panels vertically with A/B labels
  combined_plot <- plot_grid(density_plot, cdf_plot, ncol = 1, align = 'v', labels = c("A", "B"))

  return(combined_plot)
}


# Traits to visualize as rank-density and rank-CDF panels
traits <- c("BP", "Afib", "QRS","LVEF","Heart_rate")

# Generate and store one combined figure per trait, keyed by trait name
plots_list <- list()
for (trait in traits) {
  plots_list[[trait]] <- generate_plots_for_trait(prs_data_discovery, trait, group_colors)
}

######## Access each plot by its trait name from the plots_list ##########
# For example, to print the plot for QRS, use:
print(plots_list[["QRS"]])

print(plots_list[["BP"]])

Coding regions******************************************************************************************************************

Bring in data

# Per-individual coding-variant statistics (tab-delimited export).
# NOTE: hard-coded absolute user path, as with the PRS table above.
coding_stats <- read.table(
  "/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/statistics_by_individual_replication.txt",
  header = TRUE, sep = "\t"
)

# Discovery cohort: rows flagged Replication == 0
coding_stats_discovery <- coding_stats[coding_stats$Replication == 0, ]

Categorize the data

# Harmonize the case/control labels; any other value becomes NA so it is
# removed by the na.omit() cleanup step below.
categorized_coding_stats_discovery <- coding_stats_discovery %>%
  mutate(
    Category_Group = case_when(
      Category == "control" ~ "Control",
      Category == "case"    ~ "Case",
      TRUE                  ~ NA_character_
    )
  )

Merge some of the column data to get the desired allele frequency intervals

# Collapse the per-frequency-bin missense counts into the allele-frequency
# intervals analyzed in the paper, for each disease gene set:
#   *_01_00001       = rarer bins combined (AF 0.01 and 0.001 columns)
#   *_00001          = ultra-rare bin (AF 0.00001) plus singletons
#   *_total_missense = every missense bin, including common variants
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
  mutate(
    Epilepsy_01_00001 = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_00001 = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    Epilepsy_total_missense = Epilepsy_missense_common + Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001 +
                              Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    
    Cardiomyopathy_01_00001 = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_00001 = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_total_missense = Cardiomyopathy_missense_common + Cardiomyopathy_missense_variant_0.01 +
                                    Cardiomyopathy_missense_variant_0.001 + Cardiomyopathy_missense_variant_0.00001 +
                                    Cardiomyopathy_missense_singleton
  )

# Drop identifier columns, then remove rows containing NA or infinite
# values so the permutation tests and summaries see clean numeric data.
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
  dplyr::select(-ID, -Category) %>%
  na.omit() %>%
  dplyr::filter(if_all(everything(), ~ !is.infinite(.)))

# Per-group mean and standard deviation of every numeric column
collapsed_coding_stats <- categorized_coding_stats_discovery %>%
  group_by(Category_Group) %>%
  summarise(
    across(
      where(is.numeric),
      list(mean = ~ mean(., na.rm = TRUE), std = ~ sd(., na.rm = TRUE))
    )
  )

# Drop any groups whose summaries came out NA
collapsed_coding_stats <- na.omit(collapsed_coding_stats)

Perform permutation simulation and test

# Coding-burden variables to test, per disease gene set (Cardiomyopathy and
# Epilepsy): high-impact, start-lost, stop-gained, the merged missense
# allele-frequency intervals built above, total missense, and P/LP counts.
coding_variables_to_analyze <- c("Cardiomyopathy_HIGH", "Cardiomyopathy_start_lost", 
                                 "Cardiomyopathy_stop_gained", "Cardiomyopathy_01_00001", 
                                 "Cardiomyopathy_00001", "Cardiomyopathy_total_missense", "PLP_Cardiomyopathy",
                                 "Epilepsy_HIGH", "Epilepsy_start_lost", 
                                 "Epilepsy_stop_gained", "Epilepsy_01_00001", 
                                 "Epilepsy_00001", "Epilepsy_total_missense", "PLP_Epilepsy")

# Permutation test on the absolute difference in group means.
# Shuffles the Case/Control labels `num_permutations` times and compares
# the observed |mean(Case) - mean(Control)| against the null distribution.
#
# Args:
#   data:             data frame holding `var` and `group_var` columns.
#   var:              character name of the numeric column to test.
#   group_var:        character name of the grouping column ("Case"/"Control").
#   num_permutations: number of label shuffles (default 10000).
#
# Returns list(observed_stat, perm_stats, p_value), or NULL (with a
# warning) when either group has fewer than two members.
permutation_test <- function(data, var, group_var, num_permutations = 10000) {

  labels <- data[[group_var]]
  values <- data[[var]]

  # Require at least two observations per group
  if (sum(labels == "Case", na.rm = TRUE) <= 1 ||
      sum(labels == "Control", na.rm = TRUE) <= 1) {
    warning(paste("Skipping variable", var, "due to insufficient group size"))
    return(NULL)
  }

  # Absolute difference in group means for a given labeling
  mean_gap <- function(grp) {
    abs(mean(values[grp == "Case"], na.rm = TRUE) -
        mean(values[grp == "Control"], na.rm = TRUE))
  }

  observed_stat <- mean_gap(labels)

  # Null distribution: one label shuffle per permutation
  perm_stats <- vapply(
    seq_len(num_permutations),
    function(i) mean_gap(sample(labels)),
    numeric(1)
  )

  # Empirical p-value: share of permuted gaps at least as large as observed
  p_value <- mean(perm_stats >= observed_stat)

  list(observed_stat = observed_stat, perm_stats = perm_stats, p_value = p_value)
}

# Run the permutation test for every coding variable, plot each null
# distribution, and collect the observed statistics and p-values.
# Fix 1: results are collected in a preallocated list and bound once at the
# end, instead of growing a data frame with rbind() inside the loop.
# Fix 2: geom_vline uses `linewidth` (the `size` line aesthetic was
# deprecated in ggplot2 3.4.0).
p_value_rows <- vector("list", length(coding_variables_to_analyze))

for (idx in seq_along(coding_variables_to_analyze)) {
  var <- coding_variables_to_analyze[idx]

  # Run the permutation test
  test_result <- permutation_test(categorized_coding_stats_discovery, var, "Category_Group")

  if (!is.null(test_result)) {
    # Extract the permuted statistics, observed statistic, and p-value
    perm_stats <- test_result$perm_stats
    observed_stat <- test_result$observed_stat
    p_value <- test_result$p_value

    # Record this variable's result (NULL slots are dropped at bind time)
    p_value_rows[[idx]] <- data.frame(Variable = var,
                                      Observed_Stat = observed_stat,
                                      p_value = p_value)

    # Histogram of the permutation distribution with the observed statistic
    perm_df <- data.frame(perm_stats = perm_stats)
    p <- ggplot(perm_df, aes(x = perm_stats)) +
      geom_histogram(bins = 30, fill = "lightblue", color = "black") +
      geom_vline(aes(xintercept = observed_stat), color = "red", linetype = "dashed", linewidth = 1) +
      labs(title = paste("Permutation Test for", var),
           x = "Permuted Statistic",
           y = "Frequency",
           subtitle = paste("Observed Statistic =", round(observed_stat, 4), 
                            "| P-value =", format(p_value, digits = 4))) +  # Add p-value to subtitle
      theme_minimal()

    # Print the plot
    print(p)
  }
}

# Bind the collected rows into the results table (skipped variables omitted)
p_value_results <- do.call(rbind, p_value_rows)

# Print the table of p-values
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
##                         Variable Observed_Stat p_value
## 1            Cardiomyopathy_HIGH    0.67487504  0.0001
## 2      Cardiomyopathy_start_lost    0.03608007  0.0007
## 3     Cardiomyopathy_stop_gained    0.27295893  0.0000
## 4        Cardiomyopathy_01_00001    2.19304371  0.0000
## 5           Cardiomyopathy_00001    0.55399147  0.0015
## 6  Cardiomyopathy_total_missense    8.02403705  0.0000
## 7             PLP_Cardiomyopathy    0.19359502  0.0000
## 8                  Epilepsy_HIGH    0.70615260  0.0104
## 9            Epilepsy_start_lost    0.01129570  0.0509
## 10          Epilepsy_stop_gained    0.34350191  0.0000
## 11             Epilepsy_01_00001    1.45746349  0.0000
## 12                Epilepsy_00001    0.56937910  0.0045
## 13       Epilepsy_total_missense    2.48950064  0.0001
## 14                  PLP_Epilepsy    0.01211653  0.5146

Create a function that summarizes each column as a ratio to the Control group mean

# Summarize one column per group as a ratio to the Control-group mean,
# with a normal-approximation 95% confidence interval on the ratio.
#
# Args:
#   data:     data frame with a Category_Group column and `col_name`.
#   col_name: character name of the numeric column to summarize.
#
# Returns list(summary_data = per-group table of mean, sd, n, ratio,
# SEM, and 95% CI bounds). (Returned as a list for forward compatibility
# with callers expecting $summary_data.)
plot_column_ratio <- function(data, col_name) {
  # Control-group mean is the denominator for every ratio.
  # (A previously computed Control-group SD was never used and is removed.)
  group1_mean <- mean(data[data$Category_Group == "Control", col_name], na.rm = TRUE)

  data_summary <- data %>%
    group_by(Category_Group) %>%
    summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
              sd_value = sd(.data[[col_name]], na.rm = TRUE),
              n = n()) %>%
    mutate(ratio = mean_value / group1_mean,
           sem_value = sd_value / sqrt(n),
           # SEM scaled to the Control mean, matching the ratio's scale
           sem_ratio = sem_value / group1_mean,
           ci_lower = ratio - (1.96 * sem_ratio),  # lower bound of the 95% CI
           ci_upper = ratio + (1.96 * sem_ratio))  # upper bound of the 95% CI

  return(list(summary_data = data_summary))
}

Plot the data relative to control

# Collect the per-column ratio summaries (relative to Control) into one dataframe.
# FIX: the original seeded bind_rows with a dataframe whose columns (mean, std)
# never match plot_column_ratio()'s output (mean_value, sd_value, n, ...), which
# left stray all-NA columns in the combined result. Accumulate in a list and
# bind once at the end instead.
summary_list <- list()

# List of all columns to plot (everything except the identifier/label columns)
columns_to_plot <- setdiff(names(categorized_coding_stats_discovery), c("ID", "Category", "Category_Group"))

# Summarize each column relative to the Control group
for (col in columns_to_plot) {
  plot_data <- plot_column_ratio(categorized_coding_stats_discovery, col)
  summary_list[[col]] <- mutate(plot_data$summary_data, col_name = col)
}

combined_coding_stats_summary_df <- bind_rows(summary_list)

# Variants of interest: burden categories plus PLP counts for both gene panels
variants_to_show <- c(
  "Cardiomyopathy_total_missense",
  "Cardiomyopathy_01_00001",
  "Cardiomyopathy_00001",
  "Epilepsy_total_missense",
  "Epilepsy_01_00001",
  "Epilepsy_00001",
  "PLP_Cardiomyopathy",
  "PLP_Epilepsy",
  "Cardiomyopathy_start_lost",
  "Cardiomyopathy_stop_gained",
  "Cardiomyopathy_HIGH",
  "Epilepsy_start_lost",
  "Epilepsy_HIGH",
  "Epilepsy_stop_gained"
)

# Keep only those variants and drop the Control reference rows
selected_coding_data <- combined_coding_stats_summary_df %>%
  filter(col_name %in% variants_to_show,
         Category_Group != "Control")


# Display order for the y-axis: Epilepsy burden categories first,
# then the Cardiomyopathy (CMAR) categories, each from PLP up to HIGH
epilepsy_levels <- c(
  "PLP_Epilepsy",
  "Epilepsy_00001",
  "Epilepsy_01_00001",
  "Epilepsy_total_missense",
  "Epilepsy_start_lost",
  "Epilepsy_stop_gained",
  "Epilepsy_HIGH"
)

cardiomyopathy_levels <- c(
  "PLP_Cardiomyopathy",
  "Cardiomyopathy_00001",
  "Cardiomyopathy_01_00001",
  "Cardiomyopathy_total_missense",
  "Cardiomyopathy_start_lost",
  "Cardiomyopathy_stop_gained",
  "Cardiomyopathy_HIGH"
)

levels_order <- c(epilepsy_levels, cardiomyopathy_levels)

# Fix the plotting order of the variant categories
selected_coding_data$col_name <- factor(selected_coding_data$col_name, levels = levels_order)

# Forest-style plot: per-variant case burden as a ratio to the Control mean.
# FIX: the point layer used position_dodge(width = 1) while the error-bar
# layer used position_dodge(width = 0.5); with more than one plotted group
# the error bars would be drawn offset from their points. Both layers now
# share a single dodge position.
dodge_pos <- position_dodge(width = 0.5)

coding_stats_plot <- ggplot(selected_coding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
  geom_point(position = dodge_pos, size = 3) +
  geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = dodge_pos, width = 1) +
  geom_vline(xintercept = 1, linetype = "dashed") + # reference: ratio of 1 = Control mean
  scale_color_manual(values = "#3C8C78") +
  labs(title = "Ratio Compared to Control +/-SEM",
       y = "Variant",
       x = "Ratio to Control Mean",
       color = "Category Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 8), 
        axis.text.y = element_text(size = 8, hjust = 1),
        axis.title.x = element_text(size = 8), 
        axis.title.y = element_text(size = 8),
        legend.title = element_text(size = 8),
        legend.text = element_text(size = 8),
        plot.title = element_text(size = 8, hjust = 0.5),
        plot.subtitle = element_text(size = 8),
        plot.caption = element_text(size = 8),
        plot.margin = margin(15, 15, 15, 15)) +
  scale_x_continuous(limits = c(-2, 12))


print(coding_stats_plot)

# Re-print the permutation-test p-value table for reference alongside the plot
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
##                         Variable Observed_Stat p_value
## 1            Cardiomyopathy_HIGH    0.67487504  0.0001
## 2      Cardiomyopathy_start_lost    0.03608007  0.0007
## 3     Cardiomyopathy_stop_gained    0.27295893  0.0000
## 4        Cardiomyopathy_01_00001    2.19304371  0.0000
## 5           Cardiomyopathy_00001    0.55399147  0.0015
## 6  Cardiomyopathy_total_missense    8.02403705  0.0000
## 7             PLP_Cardiomyopathy    0.19359502  0.0000
## 8                  Epilepsy_HIGH    0.70615260  0.0104
## 9            Epilepsy_start_lost    0.01129570  0.0509
## 10          Epilepsy_stop_gained    0.34350191  0.0000
## 11             Epilepsy_01_00001    1.45746349  0.0000
## 12                Epilepsy_00001    0.56937910  0.0045
## 13       Epilepsy_total_missense    2.48950064  0.0001
## 14                  PLP_Epilepsy    0.01211653  0.5146

Input the data

# Pull in the per-individual, per-gene variant counts (columns include ID,
# GENE, HIGH, PLP, Set -- see the head() output further below)
gene_data <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_variants_by_gene.csv")

# Read the cohort metadata; arrest_ontology holds the case/control labels
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort.txt", sep = "\t", header = TRUE)

# Add the arrest category to each row by ID lookup; IDs missing from the
# cohort table will get NA
gene_data$Category <- cohort$arrest_ontology[match(gene_data$ID, cohort$CGM_id)]

# Restrict to the Discovery set
gene_data_discovery <- gene_data %>%
  filter(Set == "Discovery")

Summarize the data by GENE, counting the number of variants for each gene. Filter for Cases, then group and summarize.

# Per-gene totals of HIGH-impact and PLP variants among cases
variants_per_gene_Case <- gene_data_discovery %>%
  filter(Category == "case") %>%
  group_by(GENE) %>%
  summarise(
    HIGH = sum(HIGH, na.rm = TRUE),
    PLP = sum(PLP, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(variants_per_gene_Case)
## # A tibble: 356 × 3
##    GENE    HIGH   PLP
##    <chr>  <int> <int>
##  1 A2ML1     57     0
##  2 AARS       6     0
##  3 ABAT       0     0
##  4 ABCC9      4     0
##  5 ACADVL     4     0
##  6 ACTC1      1     0
##  7 ACTL6B     0     0
##  8 ACTN2      7     0
##  9 ADAM22     3     0
## 10 ADSL       1     1
## # ℹ 346 more rows

Filter for Controls and then group and summarize

# Per-gene totals of HIGH-impact and PLP variants among controls
variants_per_gene_Control <- gene_data_discovery %>%
  filter(Category == "control") %>%
  group_by(GENE) %>%
  summarise(
    HIGH = sum(HIGH, na.rm = TRUE),
    PLP = sum(PLP, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(variants_per_gene_Control)
## # A tibble: 341 × 3
##    GENE    HIGH   PLP
##    <chr>  <int> <int>
##  1 A2ML1     69     0
##  2 AARS       0     0
##  3 ABAT       0     0
##  4 ABCC9      3     0
##  5 ACADVL     6     3
##  6 ACTL6B     0     2
##  7 ACTN2      1     0
##  8 ADAM22     0     0
##  9 ADSL       0     0
## 10 AGL        0     0
## # ℹ 331 more rows

Load gene lists from text files

# Gene panels, one symbol per line. CMAR1/CMAR2 are tagged as cardiomyopathy
# panels below; EIEE_OMIM and Epilepsy as epilepsy panels. Some entries carry
# literal quote characters, which are stripped later.
genes_CMAR1 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR1_list.txt")
genes_CMAR2 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR2_list.txt")
genes_EIEE_OMIM <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_EIEE_OMIM_list.txt")
genes_Epilepsy <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_Epilepsy_list.txt")

Annotate gene with source panel

# Tag each panel's genes with its disease source
genes_CMAR1 <- data.frame(Gene = genes_CMAR1, Source = "Cardiomyopathy")
genes_CMAR2 <- data.frame(Gene = genes_CMAR2, Source = "Cardiomyopathy")
genes_EIEE_OMIM <- data.frame(Gene = genes_EIEE_OMIM, Source = "Epilepsy")
genes_Epilepsy <- data.frame(Gene = genes_Epilepsy, Source = "Epilepsy")

# Harmonize symbols with those used in gene_data:
# the quoted entry "AARS1" becomes the unquoted AARS
genes_EIEE_OMIM$Gene <- ifelse(genes_EIEE_OMIM$Gene == "\"AARS1\"", "AARS", genes_EIEE_OMIM$Gene)

# The quoted entry "NEXMIF" becomes its legacy alias KIAA2022.
# NOTE(review): the original comment described the opposite direction
# ("KIAA2022" -> "NEXMIF"); the code maps NEXMIF -> KIAA2022. Confirm which
# symbol the variant tables actually use.
genes_Epilepsy$Gene <- ifelse(genes_Epilepsy$Gene == "\"NEXMIF\"", "KIAA2022", genes_Epilepsy$Gene)

Append panel source to the gene_data

# Combine all panel lists into one dataframe
all_genes <- rbind(genes_CMAR1, genes_CMAR2, genes_EIEE_OMIM, genes_Epilepsy)

# Sort by Gene and Source; "Cardiomyopathy" sorts before "Epilepsy", so for a
# gene present in both panel types the Cardiomyopathy annotation wins below
all_genes <- all_genes[order(all_genes$Gene, all_genes$Source), ]

# Remove duplicates, keeping the first occurrence (Cardiomyopathy preferred)
all_genes <- all_genes[!duplicated(all_genes$Gene), ]

# Clean the 'Gene' column by removing literal quote characters.
# NOTE(review): quotes are stripped AFTER deduplication, so a gene present
# both quoted and unquoted in the inputs could survive as two rows here;
# match() below would then use the first occurrence. Confirm the input lists.
all_genes$Gene <- gsub('\"', '', all_genes$Gene)

# Ensure that the 'GENE' column in 'gene_data_discovery' and 'Gene' in 'all_genes' are character
gene_data_discovery$GENE <- as.character(gene_data_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)

# Look up each gene's panel source and attach it as a new column; genes not
# present in any panel get NA
gene_data_discovery$Source <- all_genes$Source[match(gene_data_discovery$GENE, all_genes$Gene)]

# Now, 'gene_data_discovery' has a new column 'Source' with the source panel
# of each gene, based on the lookup from 'all_genes'

# View the first few rows to verify the new 'Source' has been added
head(gene_data_discovery)
##           ID    GENE HIGH PLP       Set Category         Source
## 1 CGM0000001   A2ML1    0   0 Discovery  control Cardiomyopathy
## 2 CGM0000001    ABAT    0   0 Discovery  control       Epilepsy
## 3 CGM0000001  ADAM22    0   0 Discovery  control       Epilepsy
## 4 CGM0000001   AKAP9    0   0 Discovery  control Cardiomyopathy
## 5 CGM0000001 ALDH5A1    0   0 Discovery  control       Epilepsy
## 6 CGM0000001   ALMS1    0   0 Discovery  control Cardiomyopathy

Add the source to the control variants list

# Annotate the control per-gene counts with their panel source (NA if unmatched)
variants_per_gene_Control$Source <- all_genes$Source[match(variants_per_gene_Control$GENE, all_genes$Gene)]

head(variants_per_gene_Control)
## # A tibble: 6 × 4
##   GENE    HIGH   PLP Source        
##   <chr>  <int> <int> <chr>         
## 1 A2ML1     69     0 Cardiomyopathy
## 2 AARS       0     0 Epilepsy      
## 3 ABAT       0     0 Epilepsy      
## 4 ABCC9      3     0 Cardiomyopathy
## 5 ACADVL     6     3 Cardiomyopathy
## 6 ACTL6B     0     2 Epilepsy

Add the source to the Case variants list

# Annotate the case per-gene counts with their panel source (NA if unmatched)
variants_per_gene_Case$Source <- all_genes$Source[match(variants_per_gene_Case$GENE, all_genes$Gene)]

head(variants_per_gene_Case)
## # A tibble: 6 × 4
##   GENE    HIGH   PLP Source        
##   <chr>  <int> <int> <chr>         
## 1 A2ML1     57     0 Cardiomyopathy
## 2 AARS       6     0 Epilepsy      
## 3 ABAT       0     0 Epilepsy      
## 4 ABCC9      4     0 Cardiomyopathy
## 5 ACADVL     4     0 Cardiomyopathy
## 6 ACTC1      1     0 Cardiomyopathy

Combine the variants together

# Label each per-gene summary with its cohort group before stacking
variants_per_gene_Control <- variants_per_gene_Control %>%
  mutate(Category = "Control")  

variants_per_gene_Case <- variants_per_gene_Case %>%
  mutate(Category = "Case")  

# Combine the two datasets (rows stacked; genes present in only one group
# appear once)
combined_variants <- bind_rows(variants_per_gene_Control, variants_per_gene_Case)

# Print the combined dataset
print(combined_variants)
## # A tibble: 697 × 5
##    GENE    HIGH   PLP Source         Category
##    <chr>  <int> <int> <chr>          <chr>   
##  1 A2ML1     69     0 Cardiomyopathy Control 
##  2 AARS       0     0 Epilepsy       Control 
##  3 ABAT       0     0 Epilepsy       Control 
##  4 ABCC9      3     0 Cardiomyopathy Control 
##  5 ACADVL     6     3 Cardiomyopathy Control 
##  6 ACTL6B     0     2 Epilepsy       Control 
##  7 ACTN2      1     0 Cardiomyopathy Control 
##  8 ADAM22     0     0 Epilepsy       Control 
##  9 ADSL       0     0 Epilepsy       Control 
## 10 AGL        0     0 Cardiomyopathy Control 
## # ℹ 687 more rows

Plot the number of variant types in cases and controls

# Keep only genes with at least one HIGH-impact variant in either group
combined_variants_High <- combined_variants %>% filter(HIGH > 0)


# Heatmap of HIGH-impact variant counts: one tile per gene x group, faceted
# by panel source
High_variant_plot <- ggplot(combined_variants_High, aes(x = GENE, y = Category, fill = HIGH)) +
  geom_tile() + 
  scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"), 
                       values = scales::rescale(c(0, 0.5, 1))) + 
  facet_wrap(~ Source, scales = "free_x", ncol = 1) + 
  labs(title = "High Variants Heatmap",
       x = "Gene",
       y = "Category",
       fill = "Count") +
  theme_cowplot(12) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        strip.background = element_blank(), 
        strip.placement = "outside")

# Print the plot
print(High_variant_plot)

Plot the number of PLP variants in cases and controls

# Keep only genes with at least one PLP variant in either group
combined_variants_PLP <- combined_variants %>% filter(PLP > 0)

# Heatmap of PLP variant counts, same layout as the HIGH-variant heatmap above
PLP_plot <- ggplot(combined_variants_PLP, aes(x = GENE, y = Category, fill = PLP)) +
  geom_tile() + 
  scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"), 
                       values = scales::rescale(c(0, 0.5, 1))) + 
  facet_wrap(~ Source, scales = "free_x", ncol = 1) + 
  labs(title = "PLP Heatmap",
       x = "Gene",
       y = "Category",
       fill = "Count") +
  theme_cowplot(12) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        strip.background = element_blank(), 
        strip.placement = "outside")

# Print the plot
print(PLP_plot)

Now bring in the module data

modules <- read_excel("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/cardio_gene_sets.xlsx")

Annotate the gene_data with the modules

# Attach the module annotation to each gene row; genes without a module
# assignment get NA (filtered out downstream)
module_data <- left_join(gene_data_discovery, modules, by = c("GENE" = "Gene"))

# View the first few rows of the updated data
head(module_data)
##           ID    GENE HIGH PLP       Set Category         Source
## 1 CGM0000001   A2ML1    0   0 Discovery  control Cardiomyopathy
## 2 CGM0000001    ABAT    0   0 Discovery  control       Epilepsy
## 3 CGM0000001  ADAM22    0   0 Discovery  control       Epilepsy
## 4 CGM0000001   AKAP9    0   0 Discovery  control Cardiomyopathy
## 5 CGM0000001 ALDH5A1    0   0 Discovery  control       Epilepsy
## 6 CGM0000001   ALMS1    0   0 Discovery  control Cardiomyopathy
##               Module
## 1          Rasopathy
## 2               <NA>
## 3               <NA>
## 4  Membrane_polarity
## 5               <NA>
## 6 fate_specification

Now plot the PLPs by their expert-defined categories. The point size is the absolute number of variants in the category; the color is the number of variants relative to the module size.

# Keep only genes that were assigned an expert-defined module
module_data <- module_data %>%
  filter(!is.na(Module))

# Standardize the case/control label.
# BUG FIX: Category holds lowercase "control"/"case" (from arrest_ontology),
# so the original test `Category == "Control"` never matched and every row,
# controls included, was labeled "case". Compare against "control" instead.
module_data <- module_data %>%
  mutate(Category_Group = ifelse(Category == "control", "control", "case"))

# Count up the PLP variants per module and group
module_summary <- module_data %>%
  group_by(Module, Category_Group) %>%
  summarise(Total_PLP_variant = sum(PLP, na.rm = TRUE)) %>%
  ungroup() 
## `summarise()` has grouped output by 'Module'. You can override using the
## `.groups` argument.
# Number of genes per expert-defined module (used to normalize counts)
genes_per_module <- modules %>%
  count(Module, name = "Num_Genes")

# Attach the module sizes to the per-module variant totals
module_summary <- module_summary %>%
  left_join(genes_per_module, by = "Module")

# PLP burden normalized by module size
module_summary <- module_summary %>%
  mutate(Relative_Size = Total_PLP_variant / Num_Genes)

# Restrict to cases only
module_summary_filtered <- module_summary %>%
  filter(Category_Group == "case")

# Order modules by their PLP total, descending
module_order <- module_summary_filtered %>%
  group_by(Module) %>%
  summarise(Sort_Metric = mean(Total_PLP_variant, na.rm = TRUE)) %>%
  arrange(desc(Sort_Metric)) %>%
  pull(Module)

module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)

# Square-point plot: size encodes the absolute PLP count,
# color encodes the count normalized by module size
modules_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category_Group, size = Total_PLP_variant, color = Relative_Size)) +
  geom_point(shape = 15, alpha = 1) +
  scale_size(range = c(3, 20), name = "Number of PLPs") + 
  scale_color_gradient(low = "Grey", high = "#05618F", name = "# of PLPs relative to Module size") +
  theme_minimal() +
  labs(title = "Total PLP by Module (Cases)",
       x = "Module",
       y = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(modules_plot)

Build the interaction plot data

# Count the HIGH-effect variants per individual, module, and group
aggregated_data <- module_data %>%
  group_by(ID, Module, Category) %>%
  summarise(High_count = sum(HIGH),
            .groups = 'drop')

# Split off controls
data_control <- aggregated_data %>%
  filter(Category %in% c("control"))

# Identify IDs with any 'High_count' greater than 0
ids_with_both_1 <- data_control %>%
  group_by(ID) %>%
  summarise(High_count_any = any(High_count > 0)) %>%
  filter(High_count_any) %>%
  dplyr::select(ID)

# Keep only rows belonging to IDs with at least one HIGH variant
modules_with_both_1 <- data_control %>%
  semi_join(ids_with_both_1, by = "ID")

# Enumerate all pairs of modules observed for each ID, then keep pairs where
# at least one member module carries a HIGH variant.
# NOTE(review): combn(x, 2) errors if an ID has exactly one distinct Module
# (length-1 character input); this appears to assume every retained ID spans
# at least two modules -- confirm against the data.
module_pairs_for_ids_1 <- modules_with_both_1 %>%
  group_by(ID) %>%
  do({
    data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
  }) %>%
  dplyr::rename(Module1 = X1, Module2 = X2) %>%
  left_join(modules_with_both_1, by = c("ID", "Module1" = "Module")) %>%
  left_join(modules_with_both_1, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
  filter(High_count_2 > 0 | High_count_1 > 0)

# Count how many distinct control IDs carry each module pair
module_pair_counts_1 <- module_pairs_for_ids_1 %>%
  group_by(Module1, Module2) %>%
  summarise(Count = n_distinct(ID), .groups = 'drop')



# Split off cases (same pipeline as the control branch above)
data_case <- aggregated_data %>%
  filter(Category %in% c("case"))

# Identify IDs with any 'High_count' greater than 0
ids_with_both_case <- data_case %>%
  group_by(ID) %>%
  summarise(Missense_any = any(High_count > 0)) %>%
  filter(Missense_any) %>%
  dplyr::select(ID)

# Keep only rows belonging to IDs with at least one HIGH variant
modules_with_both_case <- data_case %>%
  semi_join(ids_with_both_case, by = "ID")


# Enumerate all module pairs per ID, keeping pairs where at least one member
# module carries a HIGH variant.
# NOTE(review): as in the control branch, combn() errors for an ID with a
# single distinct Module -- confirm against the data.
module_pairs_for_ids_case <- modules_with_both_case %>%
  group_by(ID) %>%
  do({
    data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
  }) %>%
  dplyr::rename(Module1 = X1, Module2 = X2) %>%
  left_join(modules_with_both_case, by = c("ID", "Module1" = "Module")) %>%
left_join(modules_with_both_case, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
  filter(High_count_2 > 0 | High_count_1 > 0)

# Count how many distinct case IDs carry each module pair
module_pair_counts_case <- module_pairs_for_ids_case %>%
  group_by(Module1, Module2) %>%
  summarise(Count = n_distinct(ID), .groups = 'drop')


# Full outer join so every module pair seen in either group is present
comparison_df <- merge(module_pair_counts_1, module_pair_counts_case, by = c("Module1", "Module2"), all = TRUE)
comparison_df[is.na(comparison_df)] <- 0  # Replace NA with 0 (pair absent in that group)

# Rename the merged count columns to their group names
names(comparison_df)[names(comparison_df) == "Count.x"] <- "Count_control"
names(comparison_df)[names(comparison_df) == "Count.y"] <- "Count_case"

# Complements: cohort members per group NOT carrying the pair, for the 2x2
# contingency tables used in the Fisher tests below
comparison_df$Not_Count_control <- sum(cohort$arrest_ontology == "control") - comparison_df$Count_control
comparison_df$Not_Count_case <- sum(cohort$arrest_ontology %in% c("case")) - comparison_df$Count_case

# Fisher's exact test for one module pair's 2x2 contingency table.
#
# Args: carrier / non-carrier counts for the control and case groups.
# Returns: the two-sided p-value from fisher.test().
perform_fisher_test <- function(Count_control, not_Count_control, count_case, not_count_case) {
  cell_counts <- c(Count_control, not_Count_control, count_case, not_count_case)
  contingency_table <- matrix(
    cell_counts,
    nrow = 2,
    dimnames = list(c("In Count", "Not in Count"), c("Group_1", "Group_case"))
  )
  fisher.test(contingency_table)$p.value
}

# Apply the Fisher test row-wise over the module-pair contingency counts
comparison_df$p_value <- mapply(perform_fisher_test, 
                                comparison_df$Count_control, 
                                comparison_df$Not_Count_control, 
                                comparison_df$Count_case, 
                                comparison_df$Not_Count_case)



# Bonferroni correction across all module pairs tested
# (n = length(p_value) is also p.adjust's default)
comparison_df$adjusted_p_value <- p.adjust(comparison_df$p_value, method = "bonferroni", n = length(comparison_df$p_value))

# Filter significant results based on adjusted p-values
significant_pairs <- comparison_df %>% filter(adjusted_p_value < 0.05)

# Print significant module pairs
print(significant_pairs)
##               Module1            Module2 Count_control Count_case
## 1                CICR          sarcomere            68        104
## 2                 DGC fate_specification             5         28
## 3                 DGC                ICD            23         51
## 4                 DGC  Membrane_polarity            14         41
## 5                 DGC       Proteostasis             4         24
## 6                 DGC          sarcomere            27         68
## 7  fate_specification       mitochondria            15         37
## 8  fate_specification   nuclear_envelope             2         17
## 9  fate_specification          sarcomere            28         73
## 10                ICD fate_specification            24         57
## 11                ICD  Membrane_polarity            33         68
## 12                ICD       Proteostasis            23         55
## 13                ICD          sarcomere            45         95
## 14  lysosomal_storage          sarcomere            25         61
## 15  Membrane_polarity fate_specification            15         46
## 16  Membrane_polarity  lysosomal_storage            12         35
## 17  Membrane_polarity       mitochondria            24         50
## 18  Membrane_polarity       Proteostasis            14         43
## 19  Membrane_polarity          sarcomere            37         83
## 20       mitochondria          sarcomere            37         77
## 21       Proteostasis fate_specification             5         31
## 22       Proteostasis          sarcomere            26         72
## 23          Rasopathy          sarcomere            92        127
##    Not_Count_control Not_Count_case      p_value adjusted_p_value
## 1                528            419 9.184664e-05     8.358044e-03
## 2                591            495 7.980239e-06     7.262018e-04
## 3                573            472 9.256654e-05     8.423555e-03
## 4                582            482 2.248122e-05     2.045791e-03
## 5                592            499 2.376165e-05     2.162310e-03
## 6                569            455 4.739262e-07     4.312729e-05
## 7                581            486 3.201544e-04     2.913405e-02
## 8                594            506 2.162122e-04     1.967531e-02
## 9                568            450 6.060785e-08     5.515314e-06
## 10               572            466 1.366726e-05     1.243721e-03
## 11               563            455 1.500246e-05     1.365223e-03
## 12               573            468 1.660428e-05     1.510989e-03
## 13               551            428 1.046357e-07     9.521851e-06
## 14               571            462 2.895091e-06     2.634532e-04
## 15               581            477 4.241213e-06     3.859504e-04
## 16               584            488 1.330254e-04     1.210531e-02
## 17               572            473 2.536897e-04     2.308576e-02
## 18               582            480 8.210955e-06     7.471969e-04
## 19               559            440 1.988805e-07     1.809813e-05
## 20               559            446 2.651718e-06     2.413064e-04
## 21               591            492 1.281642e-06     1.166294e-04
## 22               570            451 3.284151e-08     2.988577e-06
## 23               504            396 2.121104e-04     1.930205e-02

Make the plot for module makeup

# Merge the HIGH-effect count data for both groups
High_data <- rbind(data_control, data_case)


# Gene counts per module.
# NOTE(review): these counts are hard-coded; they should match what
# genes_per_module derives from the modules spreadsheet above -- confirm
# they have not drifted.
df_gene_counts <- data.frame(
  Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis", 
             "Rasopathy", "SNS_PNS", "Z_disc", "cytokine", 
             "fate_specification", "lysosomal_storage", "mitochondria", 
             "nuclear_envelope", "sarcomere"),
  Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)

# Scale the HIGH-effect data per module based on the genes in the module
High_data_scaled <- High_data %>%
  left_join(df_gene_counts, by = "Module")

High_data_scaled <- High_data_scaled %>%
  mutate(High_count_per_gene = High_count / Num_Genes)

# Compute per-module, per-group mean, SD, and SEM of the scaled counts
averages_sem_scaled <- High_data_scaled %>%
  dplyr::group_by(Module, Category) %>%
  dplyr::summarize(
    Mean = mean(High_count_per_gene, na.rm = TRUE),
    SD = sd(High_count_per_gene, na.rm = TRUE),
    N = n(),
    SEM = SD / sqrt(N),
    .groups = 'drop'
  )

# Welch t-tests (t.test default) comparing size-normalized HIGH burden
# between case and control within each module
test_results <- High_data_scaled %>%
  group_by(Module) %>%
  do({
    ttest <- t.test(High_count_per_gene ~ Category, data = .)
    tidy(ttest)
  }) %>%
  ungroup()  

# show t-test data
print(test_results)
## # A tibble: 14 × 11
##    Module      estimate estimate1 estimate2 statistic p.value parameter conf.low
##    <chr>          <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>
##  1 CICR         6.86e-3   0.0185   0.0117       1.83  6.77e-2      712. -5.01e-4
##  2 DGC          3.59e-3   0.00439  0.000798     2.49  1.31e-2      637.  7.55e-4
##  3 ICD          7.94e-3   0.0135   0.00559      2.34  1.96e-2      702.  1.27e-3
##  4 Membrane_p…  6.76e-3   0.00777  0.00101      3.95  8.96e-5      483.  3.40e-3
##  5 Proteostas…  4.79e-3   0.00533  0.000532     3.01  2.75e-3      509.  1.66e-3
##  6 Rasopathy    3.10e-3   0.0111   0.00800      2.06  3.93e-2      897.  1.52e-4
##  7 SNS_PNS      2.33e-3   0.00696  0.00463      1.04  3.00e-1      621. -2.07e-3
##  8 Z_disc      -1.02e-2   0.0478   0.0580      -1.39  1.65e-1      783. -2.45e-2
##  9 cytokine    -2.88e-2   0.0866   0.115       -3.09  2.06e-3      984. -4.71e-2
## 10 fate_speci…  6.87e-3   0.00738  0.000508     3.41  7.18e-4      475.  2.90e-3
## 11 lysosomal_…  2.17e-4   0.00236  0.00214      0.118 9.06e-1      871. -3.38e-3
## 12 mitochondr…  1.84e-3   0.00281  0.000968     2.37  1.83e-2      596.  3.12e-4
## 13 nuclear_en…  3.13e-3   0.00313  0            2.01  4.53e-2      318   6.56e-5
## 14 sarcomere    1.99e-2   0.0230   0.00314      4.59  5.60e-6      475.  1.14e-2
## # ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
# Recompute the per-module averages and SEM for plotting.
# FIX: this duplicates the computation above but originally dropped
# na.rm = TRUE from mean() and sd(); restore it so both computations agree
# and NAs cannot silently propagate into the bar heights.
averages_sem_scaled <- High_data_scaled %>%
  dplyr::group_by(Module, Category) %>%
  dplyr::summarize(
    Mean = mean(High_count_per_gene, na.rm = TRUE),
    SD = sd(High_count_per_gene, na.rm = TRUE),
    N = n(),
    SEM = SD / sqrt(N),
    .groups = 'drop'
  )


# Define fill colors and legend labels for the case/control categories
custom_fill_colors <- c("control" = "#FF6464", "case" = "#05618F")
custom_fill_labels <- c("control" = "Control", "case" = "Case")

# Dodged bar plot of mean size-normalized HIGH burden per module, +/- SEM
High_modules_plot_scaled <- ggplot(averages_sem_scaled, aes(x = Module, y = Mean, fill = Category, group = Category)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8), color = "black") +
  geom_errorbar(aes(ymin = Mean - SEM, ymax = Mean + SEM), 
                width = 0.2, position = position_dodge(0.8)) +
  scale_fill_manual(values = custom_fill_colors, labels = custom_fill_labels) +
  labs(x = "Module", y = "Average High Count Per Gene", fill = "Category") +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(High_modules_plot_scaled)

Plot the interaction network

# Relative carrier frequency: (case fraction) / (control fraction), using the
# full cohort group sizes as denominators
comparison_df <- comparison_df %>%
  mutate(Relative_Counts = (Count_case / (sum(cohort$arrest_ontology %in% c("case"))))/(Count_control / (sum(cohort$arrest_ontology == "control"))))

# Module gene counts.
# NOTE(review): duplicates the df_gene_counts definition above; keep the two
# in sync (or derive both from the modules spreadsheet).
df_gene_counts <- data.frame(
  Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis", 
             "Rasopathy", "SNS_PNS", "Z_disc", "cytokine", 
             "fate_specification", "lysosomal_storage", "mitochondria", 
             "nuclear_envelope", "sarcomere"),
  Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)

# Keep only Bonferroni-significant module pairs as edges
significant_edges_df <- comparison_df %>%
  filter(adjusted_p_value < 0.05)

# Build the undirected graph with the edge attributes taken directly from the
# dataframe. FIX: the original assigned E(g)$... <- df$... after construction,
# which silently relies on igraph preserving the dataframe's row order;
# graph_from_data_frame attaches extra columns as per-edge attributes
# explicitly, which is order-safe.
network_graph <- graph_from_data_frame(
  d = significant_edges_df[, c("Module1", "Module2", "adjusted_p_value", "Relative_Counts")],
  directed = FALSE
)

# Node attribute: number of genes in each module (drives node size in the plot)
V(network_graph)$Num_Genes <- df_gene_counts$Num_Genes[match(V(network_graph)$name, df_gene_counts$Module)]


# Force-directed network: edge color = -log10 adjusted p-value, edge width =
# case/control relative carrier frequency, node size = module gene count.
# NOTE(review): the 'fr' layout is stochastic; call set.seed() beforehand for
# a reproducible figure.
HIGH_network <- ggraph(network_graph, layout = 'fr') +
  geom_edge_link(aes(color = -log10(adjusted_p_value), edge_width = Relative_Counts), alpha = 0.8) +
  geom_node_point(aes(size = Num_Genes), color = "Black") +
  geom_node_text(aes(label = name), repel = TRUE, point.padding = unit(0.01, "lines")) +
  scale_size_continuous(range = c(3, 10)) +
  theme_graph()

print(HIGH_network)
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Make the module-abundance plot for HIGH variants

# Keep only genes with a module assignment (no-op if already filtered above)
module_data <- module_data %>%
  filter(!is.na(Module))

# Standardize labels: lowercase "control" becomes "Control", all else "Case"
module_data <- module_data %>%
  mutate(Category = ifelse(Category == "control", "Control", "Case"))

# Number of genes per module, for size normalization
genes_per_module <- modules %>%
  group_by(Module) %>%
  summarise(Num_Genes = n(), .groups = "drop")

# Per-module HIGH-variant totals with module size and size-normalized burden,
# built in a single pipeline
module_summary <- module_data %>%
  group_by(Module, Category) %>%
  summarise(Total_HIGH_variant = sum(HIGH, na.rm = TRUE), .groups = "drop") %>%
  left_join(genes_per_module, by = "Module") %>%
  mutate(Relative_Size = Total_HIGH_variant / Num_Genes)

# Cases only
module_summary_filtered <- module_summary %>%
  filter(Category == "Case")

# Modules ordered by their total HIGH count, descending
module_order <- module_summary_filtered %>%
  group_by(Module) %>%
  summarise(Sort_Metric = mean(Total_HIGH_variant, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Sort_Metric)) %>%
  pull(Module)

module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)

# Square-point plot: size encodes the absolute HIGH count,
# color encodes the count normalized by module size
modules_HIGH_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category, size = Total_HIGH_variant, color = Relative_Size)) +
  geom_point(shape = 15, alpha = 1) +
  scale_size(range = c(3, 20), name = "Number of HIGHs") + 
  scale_color_gradient(low = "Grey", high = "#05618F", name = "# of HIGHs relative to Module size") +
  theme_minimal() +
  labs(title = "Total HIGH by Module (Cases)",
       x = "Module",
       y = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(modules_HIGH_plot)

Pull in uniprot data

# Fetch UniProt feature annotations (requires network access).
# Pull down data for MYH7 (UniProt ID P12883)
MYH7_data <- drawProteins::get_features("P12883")
## [1] "Download has worked"
MYH7_data <- drawProteins::feature_to_dataframe(MYH7_data)

# Pull down data for MYBPC3 (UniProt ID Q14896)
MYBPC3_data <- drawProteins::get_features("Q14896")
## [1] "Download has worked"
MYBPC3_data <- drawProteins::feature_to_dataframe(MYBPC3_data)

# Pull down data for SCN5A (UniProt ID Q14524)
SCN5A_data <- drawProteins::get_features("Q14524")
## [1] "Download has worked"
SCN5A_data <- drawProteins::feature_to_dataframe(SCN5A_data)

Plot PLP variants

# Build the MYH7 protein schematic layer by layer
MYH7_plot <- draw_canvas(MYH7_data)
MYH7_plot <- draw_chains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_domains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_regions(MYH7_plot, MYH7_data)
#MYH7_plot <- draw_phospho(MYH7_plot, MYH7_data)

# Amino-acid positions of the variants to mark on the chain.
# NOTE(review): positions are hard-coded; presumably the observed PLP
# variants -- confirm against the variant tables.
variants <- data.frame(
  begin = c(1712,1057,908,904,869,741,723,576,369,355),
  y = rep(1, 10)  
)


# Overlay the variant positions as red points
MYH7_plot <- MYH7_plot + 
  geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)

# Strip the grid, ticks, and y-axis text for a clean schematic
MYH7_plot <- MYH7_plot + theme_bw(base_size = 14) + 
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(), 
        axis.text.y = element_blank(),
        panel.border = element_blank())

MYH7_plot

##################################################################################################
# Assemble the MYBPC3 protein schematic layer by layer
MYBPC3_plot <- draw_canvas(MYBPC3_data) %>%
  draw_chains(MYBPC3_data) %>%
  draw_domains(MYBPC3_data) %>%
  draw_regions(MYBPC3_data)
#MYBPC3_plot <- draw_phospho(MYBPC3_plot, MYBPC3_data)

####### Note: 7 variants are not shown because they are splice variants

# Amino-acid positions of the variants to overlay on the diagram
variant_positions <- c(1098, 1096, 1064, 791, 542, 502, 495, 258, 219)
variants <- data.frame(
  begin = variant_positions,
  y = rep(1, length(variant_positions))
)

# Overlay the variant markers and strip the theme down in a single chain
MYBPC3_plot <- MYBPC3_plot +
  geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3) +
  theme_bw(base_size = 14) +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        axis.text.y = element_blank(),
        panel.border = element_blank())

MYBPC3_plot

##################################################################################################
# Assemble the SCN5A protein schematic layer by layer
SCN5A_plot <- draw_canvas(SCN5A_data) %>%
  draw_chains(SCN5A_data) %>%
  draw_domains(SCN5A_data) %>%
  draw_regions(SCN5A_data)
#SCN5A_plot <- draw_phospho(SCN5A_plot, SCN5A_data)

####### Note: 2 variants are not shown because they are splice variants

# Amino-acid positions of the variants to overlay on the diagram
variant_positions <- c(1784, 1595, 1319, 1316, 845, 828, 814)
variants <- data.frame(
  begin = variant_positions,
  y = rep(1, length(variant_positions))
)

# Overlay the variant markers and strip the theme down in a single chain
SCN5A_plot <- SCN5A_plot +
  geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3) +
  theme_bw(base_size = 14) +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        axis.text.y = element_blank(),
        panel.border = element_blank())

SCN5A_plot

Noncoding******************************************************************************************************************

Pull in the file that is just the ATAC data intervals overlaid on top of the Hi-C intervals which map to the genes of interest

# Read the PC-Hi-C x ATAC overlay BED file (headerless: chrom, start, end)
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/SFRN_cohort_data/noncoding/noncoding_rare/EP_ATAC_intersect_final_output_nodups.bed", col_names = FALSE, col_types = cols(
  X1 = col_character(), # Chromosome
  X2 = col_double(),    # Start Position
  X3 = col_double()     # End Position
))

Plot noncoding interval size histogram

# Interval width in bp (end - start) for each overlay region
bed_data$interval_size <- bed_data$X3 - bed_data$X2

# Median interval size, drawn as a dashed reference line on the histogram
median_interval_size <- median(bed_data$interval_size, na.rm = TRUE)

# Histogram of interval widths with the median marked
interval_size_histogram <- ggplot(bed_data, aes(x = interval_size)) +
  geom_histogram(binwidth = 100, fill = "lightblue", color = "black") +
  geom_vline(aes(xintercept = median_interval_size), color = "black", linetype = "dashed", size = 1) +
  xlab("Interval Size") +
  ylab("Frequency") +
  ggtitle("Histogram of Interval Sizes") +
  theme_cowplot(12)

interval_size_histogram

Report the central tendency

# Mean and median noncoding interval width (bp), printed for the manuscript
mean(bed_data$interval_size)
## [1] 497.4085
median(bed_data$interval_size)
## [1] 396

Now pull in the ATAC data that maps to genes of interest, including the promoter fragment the ATAC data mapped to

# Read the annotated BED file: each ATAC region plus the Hi-C promoter
# fragment it maps to and the (possibly semicolon-joined) gene names
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/EP_ATAC_intersect_final_output_annotated_unique.bed", col_names = c("Chromosome", "Start", "End", "Chromosome2", "GeneStart", "GeneEnd", "GeneNames"), col_types = cols(
  Chromosome = col_character(),
  Start = col_double(),
  End = col_double(),
  Chromosome2 = col_character(),
  GeneStart = col_double(),
  GeneEnd = col_double(),
  GeneNames = col_character()
))

# One row per (region, gene): split multi-gene annotations on ";" and unnest
bed_data <- bed_data %>% 
  mutate(GeneNames = strsplit(GeneNames, ";")) %>%
  unnest(GeneNames)

# Panel gene list, one symbol per line
gene_list <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/panel_genes_simple.txt")

# Restrict to regions annotated to panel genes
bed_data <- bed_data %>%
  filter(GeneNames %in% gene_list)

# Number of ATAC regions mapped to each panel gene
gene_count <- bed_data %>%
  count(GeneNames)

print(gene_count)
## # A tibble: 347 × 2
##    GeneNames     n
##    <chr>     <int>
##  1 A2ML1        15
##  2 ABAT         45
##  3 ABCC9        35
##  4 ACADVL        3
##  5 ACTC1         9
##  6 ACTL6B       13
##  7 ACTN2        21
##  8 ADAM22        3
##  9 ADSL         35
## 10 AGL          59
## # ℹ 337 more rows

Plot histogram that is number of regions that map to the genes of interest

# Median number of regions per gene, marked on the histogram
median_regions <- median(gene_count$n, na.rm = TRUE)

# Histogram of how many ATAC regions map to each panel gene
regions_per_gene <- ggplot(gene_count, aes(x = n)) +
  geom_histogram(binwidth = 1, fill = "#B40F20", color = "#B40F20") +
  geom_vline(aes(xintercept = median_regions), color = "black", linetype = "dashed", size = 1) +
  xlab("Number of Regions") +
  ylab("Frequency") +
  theme_cowplot(12) +
  ggtitle("Histogram of the Distribution of Number of Regions per Gene")

regions_per_gene

Report the number of regions per gene

# Mean and median number of regions mapped per gene
mean(gene_count$n)
## [1] 31.40922
median(gene_count$n)
## [1] 24

Report the region sizes

# Width of each individual region (bp)
bed_data$region_size <- bed_data$End - bed_data$Start

# Total noncoding sequence space captured per gene
total_region_size_per_gene <- bed_data %>%
  group_by(GeneNames) %>%
  summarise(TotalRegionSize = sum(region_size))

mean(total_region_size_per_gene$TotalRegionSize)
## [1] 16239.12
median(total_region_size_per_gene$TotalRegionSize)
## [1] 12010

Plot total sequence space per gene

# Histogram of the summed region size per gene
ggplot(total_region_size_per_gene, aes(x = TotalRegionSize)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black") +
  xlab("Total Region Size") +
  ylab("Frequency") +
  ggtitle("Histogram of Total Region Sizes per Gene")

Pull information about the TSS for each gene

# Genes actually present in the filtered BED data
gene_list <- unique(bed_data$GeneNames)

# Connect to Ensembl GRCh37 (hg19) biomart over the network
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "https://grch37.ensembl.org")

# One row per transcript: symbol, chromosome, TSS coordinate, strand (+1/-1)
tss_info <- getBM(attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
                  filters = 'hgnc_symbol',
                  values = gene_list,
                  mart = ensembl)

find TSS distance

# Select the most upstream TSS among one gene's transcripts, strand-aware:
# forward-strand genes (strand == 1) take the minimum coordinate, otherwise
# the maximum. Returns a one-row data frame with `transcription_start_site`.
select_upstream_tss <- function(df) {
  # Guard against empty groups: min()/max() on length-0 input would return
  # Inf/-Inf with a warning (the all-gene version below has the same guard)
  if (nrow(df) == 0) {
    return(data.frame(transcription_start_site = NA))
  }
  if (all(df$strand == 1)) {
    # Forward strand: the smallest coordinate is most upstream
    return(data.frame(transcription_start_site = min(df$transcription_start_site, na.rm = TRUE)))
  }
  # Reverse strand (or mixed-strand records): largest coordinate is most upstream
  data.frame(transcription_start_site = max(df$transcription_start_site, na.rm = TRUE))
}

# Pick one upstream TSS per gene symbol
tss_upstream <- tss_info %>%
  group_by(hgnc_symbol) %>%
  do(select_upstream_tss(.))


# Join each region back to its contacted gene's upstream TSS
merged_data <- merge(bed_data, tss_upstream, by.x = "GeneNames", by.y = "hgnc_symbol")

# Midpoint (peak center) of each ATAC interval
merged_data$Center <- (merged_data$Start + merged_data$End) / 2

# Unsigned distance from peak center to the contacted gene's TSS (bp)
merged_data$DistanceToTSS <- abs(merged_data$Center - merged_data$transcription_start_site)

Distance to TSS

# Mean and median peak-center-to-TSS distance (bp)
mean(merged_data$DistanceToTSS)
## [1] 264213.1
median(merged_data$DistanceToTSS)
## [1] 176294

Plot distance to TSS histogram

# Median distance, marked on the histogram
median_distance <- median(merged_data$DistanceToTSS, na.rm = TRUE)

# Log10-scaled histogram of TSS distances
dist_tss <- ggplot(merged_data, aes(x = DistanceToTSS)) +
  geom_histogram(binwidth = 0.1, fill = "#FFB3BA", color = "black") +
  geom_vline(aes(xintercept = median_distance), color = "black", linetype = "dashed", size = 1) +
  xlab("Distance to TSS (bp)") +
  ylab("Frequency") +
  theme_cowplot(12) +
  ggtitle("Histogram of Distances to Transcription Start Sites (TSS)") +
  scale_x_log10(breaks = 10^(0:6) * 10000, labels = scales::comma)

dist_tss

Pull all TSSs

# Retrieve TSS information for ALL genes (no symbol filter), so the closest
# TSS to each peak can later be compared against the contacted gene's TSS
all_tss_info <- getBM(
  attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
  mart = ensembl
)

# Drop transcripts with no HGNC symbol
filtered_all_tss_info <- all_tss_info %>%
  filter(hgnc_symbol != "")

See if the peak is closer to another TSS than the contacted one!

# All-gene version of the upstream-TSS picker: same strand rule as before but
# also returns the chromosome of the chosen TSS and tolerates empty groups.
select_upstream_tss <- function(df) {
  if (nrow(df) == 0) {
    # No transcript rows for this symbol
    return(data.frame(chromosome_name = NA, transcription_start_site = NA))
  }
  on_forward <- all(df$strand == 1)
  tss <- if (on_forward) {
    min(df$transcription_start_site, na.rm = TRUE)  # forward: smallest coord
  } else {
    max(df$transcription_start_site, na.rm = TRUE)  # reverse: largest coord
  }
  # Carry over the chromosome of the record that matches the chosen TSS
  idx <- which.min(abs(df$transcription_start_site - tss))
  data.frame(chromosome_name = df$chromosome_name[idx], transcription_start_site = tss)
}

# One upstream TSS (with chromosome) per gene symbol, genome-wide
all_tss_upstream <- filtered_all_tss_info %>%
  group_by(hgnc_symbol) %>%
  do(select_upstream_tss(.)) %>%
  ungroup() 

# Prepend 'chr' so chromosome names match the UCSC-style BED seqnames
all_tss_upstream$chromosome_name <- paste0("chr", all_tss_upstream$chromosome_name)

Find the closest transcription start site (TSS) to each peak center and calculate the distance.

# Width-1 GRanges at every gene's upstream TSS; row order matches
# all_tss_upstream so `nearest()` indices can be used to subset it below
tss_ranges <- GRanges(
  seqnames = all_tss_upstream$chromosome_name,
  ranges = IRanges(start = all_tss_upstream$transcription_start_site, end = all_tss_upstream$transcription_start_site)
)

# Width-1 GRanges at every peak center
merged_data_ranges <- GRanges(
  seqnames = merged_data$Chromosome,
  ranges = IRanges(start = merged_data$Center, end = merged_data$Center)
)

# Index into tss_ranges of the nearest TSS for each peak center
# (nearest() only matches within the same seqname)
nearest_hits <- nearest(merged_data_ranges, tss_ranges)

# Rows of all_tss_upstream corresponding to each peak's nearest TSS
closest_tss_info <- all_tss_upstream[nearest_hits, ]

# Annotate each peak with its nearest gene/TSS/chromosome
merged_data$closest_gene <- closest_tss_info$hgnc_symbol
merged_data$closest_tss <- closest_tss_info$transcription_start_site
merged_data$closest_tss_chromosome <- closest_tss_info$chromosome_name

# Unsigned distance from peak center to the nearest TSS (bp)
merged_data$distance_to_closest_tss <- abs(merged_data$Center - merged_data$closest_tss)

Check if the closest TSS gene matches the originally contacted gene

# TRUE when the nearest-TSS gene is the same gene the peak contacts via Hi-C
merged_data$gene_match <- merged_data$GeneNames == merged_data$closest_gene

Compute and print the percentage of matches and mismatches between the contacted gene and the closest TSS gene.

# Fraction of peaks whose nearest TSS belongs to the contacted gene
fraction_match <- sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data)

fraction_mismatch <- 1-(sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data))

# Report as percentages
print(100*fraction_match)
## [1] 13.26727
print(100*fraction_mismatch)
## [1] 86.73273
## [1] 86.73273

Let's see which genes have regions mapped to them that are closer to other TSSs

# Per-gene match summary: does any / every region's nearest TSS belong to
# the contacted gene?
gene_summary <- merged_data %>%
  dplyr::group_by(GeneNames) %>%
  dplyr::summarize(
    any_true = any(gene_match, na.rm = TRUE),
    any_false = any(!gene_match, na.rm = TRUE),
    all_true = all(gene_match, na.rm = TRUE),
    all_false = all(!gene_match, na.rm = TRUE)
  ) %>%
  ungroup()

# Denominator adjusted for the 16 panel genes with no ATAC peaks (and hence
# no rows in gene_summary); matches the "Genes with no ATAC peaks" line below
adjusted_denominator <- nrow(gene_summary) + 16

percent_any_true <- sum(gene_summary$any_true) / adjusted_denominator * 100
number_any_true <- sum(gene_summary$any_true)
percent_any_false <- sum(gene_summary$any_false) / adjusted_denominator * 100
number_any_false <- sum(gene_summary$any_false) 
percent_all_true <- sum(gene_summary$all_true) / adjusted_denominator * 100
number_all_true <- sum(gene_summary$all_true) 
percent_all_false <- sum(gene_summary$all_false) / adjusted_denominator * 100
number_all_false <- sum(gene_summary$all_false) 
percent_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false) / adjusted_denominator * 100
number_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false) 

# Print the report (363 = total panel genes)
cat("Total genes in panel", 363 ,"\n")
## Total genes in panel 363
cat("Genes with no ATAC peaks:", 100*(16/363),"%, ",16,"\n")
## Genes with no ATAC peaks: 4.407713 %,  16
cat("Genes with at least one TRUE:", percent_any_true,"%, ", number_any_true,"\n")
## Genes with at least one TRUE: 42.69972 %,  155
cat("Genes with at least one FALSE:", percent_any_false,"%, ", number_any_false,"\n")
## Genes with at least one FALSE: 93.66391 %,  340
cat("Genes with only TRUE:", percent_all_true,"%, ", number_all_true,"\n")
## Genes with only TRUE: 1.928375 %,  7
cat("Genes with only FALSE:", percent_all_false,"%, ", number_all_false,"\n")
## Genes with only FALSE: 52.89256 %,  192
cat("Genes with both TRUE and FALSE:", percent_both_true_false,"%, ", number_both_true_false,"\n")
## Genes with both TRUE and FALSE: 40.77135 %,  148
## Genes with both TRUE and FALSE: 40.77135 %,  148

input the count data

# Per-individual noncoding variant counts
noncoding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/noncoding_frequency_by_individual.txt", header = TRUE, sep = "\t")

Categorize the data

# Map arrest ontology to display groups; anything else becomes NA
categorized_noncoding_stats <- noncoding_stats %>%
  mutate(Category_Group = case_when(
    Category == "control" ~ "Control",
    Category %in% "case" ~ "Case"
  ))

Aggregate and clean

# Drop identifier columns, then remove rows with NA or infinite values
categorized_noncoding_stats <- categorized_noncoding_stats %>%
  dplyr::select(-ID, -Category) %>% 
  na.omit() %>% 
  dplyr::filter_all(all_vars(!is.infinite(.))) 

# Per-group mean and SD for every numeric column
collapsed_noncoding_stats <- categorized_noncoding_stats %>%
  group_by(Category_Group) %>%
  summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE), 
                                           std = ~sd(., na.rm = TRUE))))

# Remove rows with NA values
collapsed_noncoding_stats <- na.omit(collapsed_noncoding_stats)

Perform permutation test

# Noncoding count columns to run the permutation test over
noncoding_variables_to_analyze <- c("variant_count", "Epilepsy_gnomAD.0.001","Epilepsy_ultrarare","Cardiomyopathy_gnomAD.0.001","Cardiomyopathy_ultrarare")

# Two-sample permutation test on the absolute difference in group means.
# Returns the observed statistic, the permutation null distribution, and the
# empirical p-value (proportion of permuted statistics >= observed).
permutation_test <- function(data, var, group_var, num_permutations = 10000) {

  values <- data[[var]]
  groups <- data[[group_var]]

  # Observed |mean(Case) - mean(Control)|
  observed_stat <- abs(mean(values[groups == "Case"]) -
                       mean(values[groups == "Control"]))

  # Build the null distribution by repeatedly shuffling the group labels
  perm_stats <- numeric(num_permutations)
  for (perm in seq_len(num_permutations)) {
    shuffled <- sample(groups)
    perm_stats[perm] <- abs(mean(values[shuffled == "Case"]) -
                            mean(values[shuffled == "Control"]))
  }

  # Empirical p-value: how often a shuffled statistic is at least as extreme
  p_value <- mean(perm_stats >= observed_stat)

  list(observed_stat = observed_stat, perm_stats = perm_stats, p_value = p_value)
}

# Collects one row (variable, observed stat, p-value) per tested column
p_value_results <- data.frame(Variable = character(), Observed_Stat = numeric(), p_value = numeric(), stringsAsFactors = FALSE)

# Run the permutation test for each variable and plot its null distribution
for (var in noncoding_variables_to_analyze) {
  # Run the permutation test
  test_result <- permutation_test(categorized_noncoding_stats, var, "Category_Group")
  
  # Unpack the null distribution, observed statistic, and p-value
  perm_stats <- test_result$perm_stats
  observed_stat <- test_result$observed_stat
  p_value <- test_result$p_value
  
  # Append this variable's result to the results table
  p_value_results <- rbind(p_value_results, data.frame(Variable = var, 
                                                       Observed_Stat = observed_stat, 
                                                       p_value = p_value))
  
  # Data frame for plotting the permutation distribution
  perm_df <- data.frame(perm_stats = perm_stats)
  
  # Histogram of the null distribution with the observed statistic marked
  p <- ggplot(perm_df, aes(x = perm_stats)) +
    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
    geom_vline(aes(xintercept = observed_stat), color = "red", linetype = "dashed", size = 1) +
    labs(title = paste("Permutation Test for", var),
         x = "Permuted Statistic",
         y = "Frequency",
         subtitle = paste("Observed Statistic =", round(observed_stat, 4), 
                          "| P-value =", format(p_value, digits = 4))) +  # Add p-value to subtitle
    theme_minimal()
  
  # Print the plot
  print(p)
}

# Print the table of p-values
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
##                      Variable Observed_Stat p_value
## 1               variant_count    133.718673  0.0000
## 2       Epilepsy_gnomAD.0.001      1.972116  0.0108
## 3          Epilepsy_ultrarare      9.608424  0.0000
## 4 Cardiomyopathy_gnomAD.0.001      5.523130  0.0000
## 5    Cardiomyopathy_ultrarare      7.756665  0.0000

Define a function to compute the ratio of each column’s mean relative to the Control group and calculate confidence intervals

# Summarise one column as the ratio of each group's mean to the Control-group
# mean, with an SEM-based 95% CI on the ratio. Returns list(summary_data = ...).
plot_column_ratio <- function(data, col_name) {
  # Control-group mean used as the denominator. `[[` extraction keeps this a
  # plain vector even if `data` is a tibble — the previous
  # data[rows, col_name] form returns a 1-column data frame on tibbles, which
  # makes mean() yield NA. (Unused group1_sd removed.)
  control_values <- data[[col_name]][data$Category_Group == "Control"]
  group1_mean <- mean(control_values, na.rm = TRUE)

  # Per-group mean/SD/n, then ratio and 95% CI relative to Control
  data_summary <- data %>%
  group_by(Category_Group) %>%
  summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
            sd_value = sd(.data[[col_name]], na.rm = TRUE), 
            n = n()) %>%
  mutate(ratio = mean_value / group1_mean,
         sem_value = sd_value / sqrt(n), #  SEM calculation
         sem_ratio = sem_value / group1_mean, # SEM ratio 
         ci_lower = ratio - (1.96 * sem_ratio), # Lower bound of the CI
         ci_upper = ratio + (1.96 * sem_ratio)) # Upper bound of the CI

  return(list(summary_data = data_summary))
}

Iterate over selected columns, compute summary statistics, and store results in a dataframe.

# Every numeric column except identifiers and the grouping column
columns_to_plot <- setdiff(names(categorized_noncoding_stats), c("ID", "Category", "Category_Group"))


# Accumulates one summary block per column
combined_noncoding_stats_summary_df <- data.frame(Category_Group = character(), col_name = character(), mean = 
                                                    numeric(), std = numeric(), stringsAsFactors = FALSE)

# Compute the Control-relative ratio summary for every column
for (col in columns_to_plot) {
  plot_data <- plot_column_ratio(categorized_noncoding_stats, col)
  # Tag each summary with its source column and append
  combined_noncoding_stats_summary_df <- bind_rows(combined_noncoding_stats_summary_df, mutate(plot_data$summary_data, col_name = col))
}

Filter and factorize selected noncoding variants, then generate a plot showing their mean ratios compared to the Control group

# Keep only the ultrarare noncoding summaries for the Case group
# (the Control row is the reference, ratio = 1 by construction)
selected_noncoding_data <- combined_noncoding_stats_summary_df %>%
  filter(col_name %in% c(
                        "Cardiomyopathy_ultrarare",
                       "Epilepsy_ultrarare"),
         Category_Group != "Control") 


# Display order for the y axis
levels_order <- c(
                      "Epilepsy_ultrarare",
                      "Cardiomyopathy_ultrarare"
                       )

# Factorize the 'col_name' column with the specified order
selected_noncoding_data$col_name <- factor(selected_noncoding_data$col_name, levels = levels_order)

# Forest-style plot: Case/Control mean ratio +/- SEM, dashed line at 1
noncoding_stats_plot <- ggplot(selected_noncoding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
  geom_point(position = position_dodge(width = 1), size = 3) +
  geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = position_dodge(width = 0.5), width = 0.5) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  scale_color_manual(values = "#FF6464") +
  labs(title = "Ratio Compared to Group 1 +/-SEM",
       y = "Variant",
       x = "Ratio to Group 1 Mean",
       color = "Category Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 8), 
        axis.text.y = element_text(size = 8, hjust = 1),
        axis.title.x = element_text(size = 8), 
        axis.title.y = element_text(size = 8),
        legend.title = element_text(size = 8),
        legend.text = element_text(size = 8),
        plot.title = element_text(size = 8, hjust = 0.5),
        plot.subtitle = element_text(size = 8),
        plot.caption = element_text(size = 8),
        plot.margin = margin(15, 15, 15, 15)) +
  scale_x_continuous(limits = c(0.75, 2))


print(noncoding_stats_plot)

Input the data (change to your path)

# Per-individual, per-gene noncoding variant counts
gene_data_noncoding <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_noncoding_variants_by_gene.csv")

# Cohort metadata (arrest ontology keyed by CGM id)
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort.txt", sep = "\t", header = TRUE)

# Annotate each row with its individual's arrest category via ID lookup
gene_data_noncoding$Category <- cohort$arrest_ontology[match(gene_data_noncoding$ID, cohort$CGM_id)]

# Restrict to the discovery set only
gene_data_noncoding_discovery <- gene_data_noncoding %>%
  filter(Set == "Discovery")

Summarize the data by GENE, counting the number of variants for each gene Filter for Category = case and then group and summarize

# Total ultrarare noncoding variants per gene across all cases
variants_per_gene_case <- gene_data_noncoding_discovery %>%
  filter(Category == "case") %>%
  group_by(GENE) %>%
  summarise(
    noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(variants_per_gene_case)
## # A tibble: 424 × 2
##    GENE       noncoding_ultrarare
##    <chr>                    <int>
##  1 A2ML1                       95
##  2 AAMP                        10
##  3 ABAT                        64
##  4 ABCC9                        0
##  5 AC011551.3                   0
##  6 ACADVL                      27
##  7 ACTC1                       31
##  8 ACTL6B                     172
##  9 ACTN2                       28
## 10 ADAM22                       0
## # ℹ 414 more rows

Filter for Category == control and then group and summarize

# Total ultrarare noncoding variants per gene across all controls
variants_per_gene_control <- gene_data_noncoding_discovery %>%
  filter(Category == "control") %>%
  group_by(GENE) %>%
  summarise(
    noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(variants_per_gene_control)
## # A tibble: 424 × 2
##    GENE       noncoding_ultrarare
##    <chr>                    <int>
##  1 A2ML1                       37
##  2 AAMP                         1
##  3 ABAT                        50
##  4 ABCC9                        0
##  5 AC011551.3                   0
##  6 ACADVL                       9
##  7 ACTC1                        7
##  8 ACTL6B                     130
##  9 ACTN2                       16
## 10 ADAM22                       0
## # ℹ 414 more rows

Append panel source to the gene_data

# Ensure both lookup columns are character so match() compares like for like
gene_data_noncoding_discovery$GENE <- as.character(gene_data_noncoding_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)

# Look up each gene's panel of origin ('Source') from the all_genes table;
# genes absent from the panel table get NA
gene_data_noncoding_discovery$Source <- all_genes$Source[match(gene_data_noncoding_discovery$GENE, all_genes$Gene)]

# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'

# View the first few rows to verify the new 'Source' has been added
head(gene_data_noncoding_discovery)
##           ID         GENE noncoding_ultrarare       Set  X X.1 Category
## 1 CGM0000969       RANGRF                   0 Discovery NA  NA     case
## 2 CGM0000969          AGL                   0 Discovery NA  NA     case
## 3 CGM0000969        RNF13                   0 Discovery NA  NA     case
## 4 CGM0000969       PRKAG2                   0 Discovery NA  NA     case
## 5 CGM0000969         JPH2                   1 Discovery NA  NA     case
## 6 CGM0000969 RP11-156P1.2                   0 Discovery NA  NA     case
##           Source
## 1 Cardiomyopathy
## 2 Cardiomyopathy
## 3       Epilepsy
## 4 Cardiomyopathy
## 5 Cardiomyopathy
## 6           <NA>

Add the source to the category 1 variants list

# Same panel-source lookup for the control summary table
variants_per_gene_control$Source <- all_genes$Source[match(variants_per_gene_control$GENE, all_genes$Gene)]

head(variants_per_gene_control)
## # A tibble: 6 × 3
##   GENE       noncoding_ultrarare Source        
##   <chr>                    <int> <chr>         
## 1 A2ML1                       37 Cardiomyopathy
## 2 AAMP                         1 <NA>          
## 3 ABAT                        50 Epilepsy      
## 4 ABCC9                        0 Cardiomyopathy
## 5 AC011551.3                   0 <NA>          
## 6 ACADVL                       9 Cardiomyopathy

Add the source to the case variants list

# Same panel-source lookup for the case summary table
variants_per_gene_case$Source <- all_genes$Source[match(variants_per_gene_case$GENE, all_genes$Gene)]

head(variants_per_gene_case)
## # A tibble: 6 × 3
##   GENE       noncoding_ultrarare Source        
##   <chr>                    <int> <chr>         
## 1 A2ML1                       95 Cardiomyopathy
## 2 AAMP                        10 <NA>          
## 3 ABAT                        64 Epilepsy      
## 4 ABCC9                        0 Cardiomyopathy
## 5 AC011551.3                   0 <NA>          
## 6 ACADVL                      27 Cardiomyopathy

combine the variants together now

# Tag each summary table with its cohort category before stacking
variants_per_gene_control <- variants_per_gene_control %>%
  mutate(Category = "control")  

# Add a new column to indicate the category range
variants_per_gene_case <- variants_per_gene_case %>%
  mutate(Category = "case")  

# Stack case and control per-gene summaries into one long table
combined_variants <- bind_rows(variants_per_gene_control, variants_per_gene_case)

# Print the combined dataset
print(combined_variants)
## # A tibble: 848 × 4
##    GENE       noncoding_ultrarare Source         Category
##    <chr>                    <int> <chr>          <chr>   
##  1 A2ML1                       37 Cardiomyopathy control 
##  2 AAMP                         1 <NA>           control 
##  3 ABAT                        50 Epilepsy       control 
##  4 ABCC9                        0 Cardiomyopathy control 
##  5 AC011551.3                   0 <NA>           control 
##  6 ACADVL                       9 Cardiomyopathy control 
##  7 ACTC1                        7 Cardiomyopathy control 
##  8 ACTL6B                     130 Epilepsy       control 
##  9 ACTN2                       16 Cardiomyopathy control 
## 10 ADAM22                       0 Epilepsy       control 
## # ℹ 838 more rows

Plot the number of noncoding variant types for the genes

# Keep genes with at least one ultrarare noncoding variant and a known panel
noncoding_ultrarare_data <- combined_variants %>%
  filter(noncoding_ultrarare > 0) %>%
  filter(!is.na(Source))


# Map raw Category values to display labels for the heatmap y-axis.
custom_labeller_noncoding <- function(variable, value) {
  if (variable == "Category") {
    # BUG FIX: Category holds "control"/"case" (see combined_variants), never
    # "1" — the old `value == "1"` test labelled every tick "Case".
    return(ifelse(value == "control", "Control", "Case"))
  }
  return(value)
}

# Heatmap of ultrarare noncoding counts: genes on x, case/control on y,
# faceted by panel source
noncoding_ultrarare_plot <- ggplot(noncoding_ultrarare_data, aes(x = GENE, y = Category, fill = noncoding_ultrarare)) +
  geom_tile() + 
  scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"), 
                       values = scales::rescale(c(0, 0.5, 1))) + 
  facet_wrap(~ Source, scales = "free_x", ncol = 1) + 
  labs(title = "Noncoding Ultrarare Heatmap",
       x = "Gene",
       y = "Category",
       fill = "Count") +
  theme_cowplot(12) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        strip.background = element_blank(), 
        strip.placement = "outside") +
  scale_y_discrete(labels = function(x) custom_labeller_noncoding("Category", x))

# Print the plot
print(noncoding_ultrarare_plot)

Extract pLI constraint scores from gnomAD, compute enrichment stats for noncoding ultrarare variants, and plot their significance relative to pLI

# gnomAD v2.1.1 per-gene LoF constraint metrics (includes pLI)
gnomad_data <- fread("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/gnomad.v2.1.1.lof_metrics.by_gene.txt")

# Look up the gnomAD pLI score for one gene symbol (data.table subset);
# errors when the symbol is absent so callers can try()/skip it.
get_pLI <- function(gene_symbol, gnomad_data) {
  hit <- gnomad_data[gene == gene_symbol, .(gene, pLI)]
  if (nrow(hit) < 1) {
    stop("Gene not found in gnomAD data.")
  }
  hit
}

# Accumulates per-gene chi-square p-value, pLI, panel source, and case/control
# rate ratio across all panel sources
final_merged_data <- data.frame(
  GENE = character(),
  p_value = numeric(),
  pLI = numeric(),
  Source = character(),
  ratio = numeric(),
  stringsAsFactors = FALSE
)

# Process each panel "Source" group separately
for (source in unique(noncoding_ultrarare_data$Source)) {
  
  # Filter data by panel "Source"
  source_data <- noncoding_ultrarare_data %>% filter(Source == source)
  
  # Extract unique genes for this Source
  unique_genes <- unique(source_data$GENE)
  
  # Create a data frame to store pLI values
  pli_values <- data.frame(GENE = character(), pLI = numeric(), stringsAsFactors = FALSE)
  
  # Fetch pLI per gene; try() skips symbols absent from gnomAD
  for (gene in unique_genes) {
    try({
      pli_value <- get_pLI(gene, gnomad_data)
      pli_values <- rbind(pli_values, data.frame(GENE = gene, pLI = pli_value$pLI, stringsAsFactors = FALSE))
    }, silent = TRUE)
  }
  
  # Per-gene 1-df chi-square of observed vs expected counts; 537 and 506 are
  # presumably the control and case cohort sizes (matches the /537, /506
  # normalisation in the ratio) — confirm against the cohort table
  p_values <- source_data %>%
    group_by(GENE) %>%
    summarise(
      p_value = {
        control_count <- sum(noncoding_ultrarare[Category == "control"], na.rm = TRUE)
        case_count   <- sum(noncoding_ultrarare[Category == "case"], na.rm = TRUE)
        total_count  <- sum(noncoding_ultrarare, na.rm = TRUE)
        expected_control <- total_count * (537 / (537 + 506))
        expected_case    <- total_count * (506 / (537 + 506))
        chisq_stat <- ((control_count - expected_control)^2 / expected_control) + 
                      ((case_count - expected_case)^2 / expected_case)
        pchisq(chisq_stat, df = 1, lower.tail = FALSE)
      },
      ratio = {
        control_count <- sum(noncoding_ultrarare[Category == "control"], na.rm = TRUE)
        case_count    <- sum(noncoding_ultrarare[Category == "case"], na.rm = TRUE)
        if (is.na(control_count) | is.na(case_count) | control_count == 0) {
          NA
        } else {
          (case_count / 506) / (control_count / 537)
        }
      },
      .groups = "drop"
    )
  
  # Keep only genes that have both a test result and a pLI value
  merged_data <- merge(p_values, pli_values, by = "GENE")
  merged_data$Source <- source
  
  # Append to the final data frame
  final_merged_data <- rbind(final_merged_data, merged_data)
}

# Bonferroni threshold: 0.05 divided by the number of gene-level tests
num_tests <- nrow(final_merged_data)
bonferroni_cutoff <- 0.05 / num_tests

# Genes surviving Bonferroni correction
bonferroni_filtered_data <- final_merged_data %>%
  filter(p_value < bonferroni_cutoff)

# Top 12 most significant genes per source, used for text labels
top10_labels <- bonferroni_filtered_data %>%
  group_by(Source) %>%
  slice_min(order_by = p_value, n = 12) %>%
  ungroup()

# Restrict the plot below to the Cardiomyopathy panel
cardio_data <- bonferroni_filtered_data %>%
  filter(Source == "Cardiomyopathy")

# Scatter of -log10(p) vs pLI for significant cardiomyopathy genes,
# coloured by case/control rate ratio and labelled with top hits
reg_variant_plot <- ggplot(cardio_data, aes(y = -log10(p_value), x = pLI, color = ratio)) +
  geom_point() +
  geom_text(
    data = top10_labels %>% filter(Source == "Cardiomyopathy"),
    aes(label = GENE),
    hjust = 0.5, vjust = -0.5
  ) +
  scale_color_gradient(low = "#3C8C78", high = "#dcb43c", na.value = "grey") +
  labs(
    title = paste(
      "P-Values of Noncoding Ultrarare Variants vs. pLI by Source (Bonferroni, p <",
      format(bonferroni_cutoff, digits = 2), ")"
    ),
    x = "pLI",
    y = "-log10(p-value)",
    color = "Ratio\n(Case / Control)"
  ) +
  theme_cowplot(12)

# Print the plot
print(reg_variant_plot)

This is added from HOMER results

# Transcription-factor motif enrichment p-values, transcribed from HOMER output
motif_data <- data.frame(
  TranscriptionFactor = c("IRF4", "NFY", "NKX2.5", "PRDM1", "SMAD2", "ESRRA", "NFKB","ASCL1"),
  PValue = c(1.00E-23, 1.00E-19, 1.00E-18, 1.00E-17, 1.00E-17, 1.00E-17, 1.00E-15, 1.00E-13)
)

# Horizontal bar chart of -log10 motif p-values
motif_plot <- ggplot(motif_data, aes(y = TranscriptionFactor, x = -log10(PValue))) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Transcription Factor P-values",
       y = "Transcription Factor",
       x = "-log10(P-value)") +
    theme_cowplot(12)

motif_plot

Combined****************************************************************************************************************** Read the cohort data

# Combined coding + noncoding per-individual table for both sets
total_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/combined_data.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)


# Split into discovery and replication sets
combined_data <- total_data %>% filter(Set =="Discovery") 

replication_data <- total_data %>%  filter(Set =="Replication") 

Categorize the Data based on original categories and arrest status

# Assign each discovery participant to "Control" or "zCase" and drop rows
# whose arrest_group matches neither (case_when leaves them NA).
# "zCase" rather than "Case" keeps the case level second alphabetically,
# which downstream factor-ordered plots rely on.
categorized_combined_data <- combined_data %>%
  mutate(
    Category = case_when(
      arrest_group == "control" ~ "Control",
      arrest_group == "case" ~ "zCase"
    )
  ) %>%
  filter(!is.na(Category)) %>%
  mutate(Category = as.factor(Category))

############################ ############## ############## replication
# Categorize the replication cohort identically to the discovery cohort.
# "zCase" (rather than "Case") keeps the case level ordered after "Control"
# alphabetically, which downstream plotting depends on.
categorized_replication_data <- replication_data %>%
  mutate(Category = case_when(
    arrest_group == "control" ~ "Control",
    # `==` for consistency with the discovery block (was `%in%`; identical
    # result here since unmatched/NA rows are dropped by the filter below)
    arrest_group == "case" ~ "zCase"
  )) %>%
  filter(!is.na(Category))

# Convert Category to a factor
categorized_replication_data$Category <- as.factor(categorized_replication_data$Category)

Collapse the rare vars

# Collapse the fine-grained variant count columns into summary burden columns
# for the discovery cohort (one mutate instead of six sequential ones):
#   *_rare           = sum of the two rarer missense frequency bins
#   *_ultrarare      = ultra-rare missense + cohort singletons
#   *_null           = start-lost + stop-gained (putative loss of function)
#   *_noncoding_rare = sum of the rare noncoding frequency bins
# (Column suffixes presumably encode allele-frequency thresholds — confirm
# against the variant-annotation pipeline.)
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    Epilepsy_rare = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_ultrarare = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    Epilepsy_null = Epilepsy_start_lost + Epilepsy_stop_gained,
    Cardiomyopathy_rare = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_ultrarare = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_null = Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained,
    Epilepsy_noncoding_rare = Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton,
    Cardiomyopathy_noncoding_rare = Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton
  )


############################ ############## ############## replication
# Same burden-collapsing as the discovery cohort, applied to replication,
# consolidated into a single mutate for readability.
categorized_replication_data <- categorized_replication_data %>%
  mutate(
    Epilepsy_rare = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_ultrarare = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    Epilepsy_null = Epilepsy_start_lost + Epilepsy_stop_gained,
    Cardiomyopathy_rare = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_ultrarare = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_null = Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained,
    Epilepsy_noncoding_rare = Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton,
    Cardiomyopathy_noncoding_rare = Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton
  )

Perform multivariate Logistic Regression on everything

# Full multivariable logistic model of case status (Category: Control/zCase,
# so glm models P(zCase)) on all predictor classes at once: normalized PRS
# columns, rare/ultrarare coding burdens, P/LP carrier columns, noncoding
# rare burdens, and putative-null burdens. Fit on the discovery cohort only.
model <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare  + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare  + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null, 
             data = categorized_combined_data, family = binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the warning above indicates (quasi-)complete separation on at
# least one predictor; the affected coefficient estimates may be unstable.
model
## 
## Call:  glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval + 
##     Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + 
##     Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + 
##     Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + 
##     Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + 
##     Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + 
##     Cardiomyopathy_null + Epilepsy_null, family = binomial(), 
##     data = categorized_combined_data)
## 
## Coefficients:
##                   (Intercept)          Normalized_Heart_rate  
##                      -1.13276                       -0.06633  
##        Normalized_PR_interval                 Normalized_QTc  
##                       0.10545                       -0.06282  
##                 Normalized_BP                 Normalized_QRS  
##                       0.29888                        0.41302  
##               Normalized_LVEF               Normalized_LVESV  
##                      -0.17686                        0.16828  
##                Normalized_SVI                Normalized_Afib  
##                       0.05697                       -0.07965  
##             Normalized_LVEDVI                  Normalized_SV  
##                      -0.09490                       -0.09174  
##                 Epilepsy_rare             Epilepsy_ultrarare  
##                      -0.06083                       -0.01102  
##           Cardiomyopathy_rare       Cardiomyopathy_ultrarare  
##                      -0.03503                       -0.01155  
##                  PLP_Epilepsy             PLP_Cardiomyopathy  
##                      -0.19700                        1.18960  
## Cardiomyopathy_noncoding_rare        Epilepsy_noncoding_rare  
##                      -0.01056                        0.03214  
##           Cardiomyopathy_null                  Epilepsy_null  
##                       0.96647                        0.44278  
## 
## Degrees of Freedom: 992 Total (i.e. Null);  971 Residual
## Null Deviance:       1370 
## Residual Deviance: 1132  AIC: 1176

Compute the scores

# tell it how to make the scores
# Linear predictors (log-odds scale, type = "link") from the full model:
# in-sample for the discovery cohort, out-of-sample for replication via newdata.
scores <- predict(model, type = "link")

scores_replication <- predict(model, newdata = categorized_replication_data,  type = "link")

# Add scores to the dataframes
categorized_combined_data$scores <- scores
categorized_replication_data$scores_replication <- scores_replication

Perform multivariate logistic regressions based only on the input parameters and subsets

# Sub-models restricted to single predictor classes (or pairs of classes),
# each fit on the discovery cohort. Their linear predictors become the
# component risk scores compared against the full model below.

# Cardiomyopathy coding burden only (P/LP + rare + ultrarare + null)
model_cardiomyopathy_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null, data = categorized_combined_data, family = binomial())

# Epilepsy coding burden only
model_epilepsy_rare <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null, data = categorized_combined_data, family = binomial())

# Noncoding rare burden only (both gene sets)
model_noncoding_rare <- glm(Category ~ Epilepsy_noncoding_rare + Cardiomyopathy_noncoding_rare, data = categorized_combined_data, family = binomial())

# Common-variant PRS columns only
model_common <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())

# All coding rare burden (both gene sets)
model_coding_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_null + Epilepsy_null, data = categorized_combined_data, family = binomial())

# Epilepsy coding burden + common-variant PRS
model_epilepsy_and_common <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())

# Cardiomyopathy coding burden + common-variant PRS
model_cardiac_and_common <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())

# Common-variant PRS + all coding rare burden (i.e. the full model minus the
# noncoding terms)
model_common_and_coding <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare  + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_null + Epilepsy_null, 
             data = categorized_combined_data, family = binomial())

Compute the rest of the scores and determine rank-based quintiles for all scores

# In-sample linear-predictor scores (type = "link") from each sub-model,
# stored as named columns on the discovery data. The list maps the output
# column name to the fitted model it comes from.
submodel_scores <- list(
  cardiac_variant_score = model_cardiomyopathy_rare,
  epilepsy_variant_score = model_epilepsy_rare,
  noncoding_variant_score = model_noncoding_rare,
  common_variant_score = model_common,
  common_and_coding_score = model_common_and_coding,
  coding_rare_score = model_coding_rare,
  epilepsy_and_common_score = model_epilepsy_and_common,
  cardiac_and_common_score = model_cardiac_and_common
)
for (score_col in names(submodel_scores)) {
  categorized_combined_data[[score_col]] <- predict(submodel_scores[[score_col]], type = "link")
}


# Rank each score (ties broken by position) and bin the ranks into five
# roughly equal-sized groups: ceiling(rank / (n/5)) maps ranks 1..n onto
# quintiles 1..5.
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    rank = rank(scores, ties.method = "first"),
    score_quintiles = ceiling(rank / (n() / 5)),
    rank_cardiac = rank(cardiac_variant_score, ties.method = "first"),
    cardiac_score_quintiles = ceiling(rank_cardiac / (n() / 5)),
    rank_epilepsy = rank(epilepsy_variant_score, ties.method = "first"),
    epilepsy_score_quintiles = ceiling(rank_epilepsy / (n() / 5)),
    rank_noncoding = rank(noncoding_variant_score, ties.method = "first"),
    noncoding_score_quintiles = ceiling(rank_noncoding / (n() / 5)),
    rank_common = rank(common_variant_score, ties.method = "first"),
    common_score_quintiles = ceiling(rank_common / (n() / 5)),
    rank_common_and_coding = rank(common_and_coding_score, ties.method = "first"),
    common_and_coding_score_quintiles = ceiling(rank_common_and_coding / (n() / 5)),
    rank_model_coding_rare = rank(coding_rare_score, ties.method = "first"),
    coding_rare_score_quintiles = ceiling(rank_model_coding_rare / (n() / 5)),
    rank_epilepsy_and_common = rank(epilepsy_and_common_score, ties.method = "first"),
    epilepsy_and_common_score_quintiles = ceiling(rank_epilepsy_and_common / (n() / 5)),
    rank_cardiac_and_common = rank(cardiac_and_common_score, ties.method = "first"),
    cardiac_and_common_score_quintiles = ceiling(rank_cardiac_and_common / (n() / 5))
  )

# Convert each linear-predictor score to a predicted probability with the
# inverse logit (plogis); equivalent to predict(..., type = "response").
# (Trailing comma after the last mutate argument removed — rlang tolerates
# it, but it is non-idiomatic and trips other parsers.)
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    probability = plogis(scores),
    cardiac_probability = plogis(cardiac_variant_score),
    epilepsy_probability = plogis(epilepsy_variant_score),
    noncoding_probability = plogis(noncoding_variant_score),
    common_probability = plogis(common_variant_score),
    common_and_coding_probability = plogis(common_and_coding_score),
    coding_rare_probability = plogis(coding_rare_score),
    epilepsy_and_common_probability = plogis(epilepsy_and_common_score),
    cardiac_and_common_probability = plogis(cardiac_and_common_score)
  )


# Function to calculate odds ratio and CI
#
# Per-quintile odds ratios of case status relative to the lowest quintile.
# Args:
#   data            data frame containing the two columns below
#   category_column unquoted column holding "Control" / "zCase"
#   quintile_column unquoted quintile assignment column (1..5)
# Returns one row per quintile: counts, odds, odds_ratio vs. quintile 1,
# and a 95% Wald CI on the odds ratio.
calculate_odds_ratios <- function(data, category_column, quintile_column) {
  # Compute counts and initial odds for each quintile
  odds_data <- data %>%
    dplyr::group_by({{ quintile_column }}) %>%
    dplyr::summarise(
      count_category_1 = sum({{ category_column }} == "Control", na.rm = TRUE),
      count_category_case = sum({{ category_column }} == "zCase", na.rm = TRUE),
      .groups = 'drop'
    ) %>%
    dplyr::mutate(
      # NOTE(review): odds is Inf when a quintile contains zero controls
      odds = count_category_case / count_category_1
    )

  # Retrieve the odds of the first quintile as the reference
  # (summarise() returns groups in sorted order, so row 1 is the lowest quintile)
  first_quintile_odds <- odds_data$odds[1]

  # Calculate the odds ratio relative to the first quintile
  odds_data <- odds_data %>%
    dplyr::mutate(
      odds_ratio = odds / first_quintile_odds,
      # NOTE(review): this SE uses only the current quintile's counts; the
      # usual Wald SE for an OR against a reference group also includes
      # 1/count terms for the reference quintile — confirm this is intended.
      se_log_odds_ratio = sqrt(1 / count_category_1 + 1 / count_category_case),
      lower_ci = exp(log(odds_ratio) - 1.96 * se_log_odds_ratio),
      upper_ci = exp(log(odds_ratio) + 1.96 * se_log_odds_ratio)
    )

  return(odds_data)
}

# Apply function to each odds category
# Quintile odds-ratio summaries for the discovery cohort, one per score.
# (`<-` replaces `=` for top-level assignment, per R style. The
# `commmon_and_epilepsy_summary` spelling is preserved because later code
# may reference it by that name.)
combined_odds_summary <- calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
cardiac_odds_summary <- calculate_odds_ratios(categorized_combined_data, Category, cardiac_score_quintiles)
epilepsy_summary <- calculate_odds_ratios(categorized_combined_data, Category, epilepsy_score_quintiles)
common_summary <- calculate_odds_ratios(categorized_combined_data, Category, common_score_quintiles)
noncoding_summary <- calculate_odds_ratios(categorized_combined_data, Category, noncoding_score_quintiles)
common_and_coding_summary <- calculate_odds_ratios(categorized_combined_data, Category, common_and_coding_score_quintiles)
coding_rare_summary <- calculate_odds_ratios(categorized_combined_data, Category, coding_rare_score_quintiles)
commmon_and_epilepsy_summary <- calculate_odds_ratios(categorized_combined_data, Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary <- calculate_odds_ratios(categorized_combined_data, Category, cardiac_and_common_score_quintiles)


################################################################################################################ replication


# Out-of-sample linear-predictor scores for the replication cohort from each
# discovery-fit sub-model, stored with a _replication suffix. The list maps
# each output column name to the model that produces it.
replication_score_models <- list(
  cardiac_variant_score_replication = model_cardiomyopathy_rare,
  epilepsy_variant_score_replication = model_epilepsy_rare,
  noncoding_variant_score_replication = model_noncoding_rare,
  common_variant_score_replication = model_common,
  common_and_coding_score_replication = model_common_and_coding,
  coding_rare_score_replication = model_coding_rare,
  epilepsy_and_common_score_replication = model_epilepsy_and_common,
  cardiac_and_common_score_replication = model_cardiac_and_common
)
for (score_col in names(replication_score_models)) {
  categorized_replication_data[[score_col]] <- predict(
    replication_score_models[[score_col]],
    newdata = categorized_replication_data,
    type = "link"
  )
}


# Rank-based quintiles for the replication cohort, computed exactly as for
# discovery: ceiling(rank / (n/5)) maps ranks 1..n onto quintiles 1..5.
categorized_replication_data <- categorized_replication_data %>%
  mutate(
    rank = rank(scores_replication, ties.method = "first"),
    score_quintiles = ceiling(rank / (n() / 5)),
    rank_cardiac = rank(cardiac_variant_score_replication, ties.method = "first"),
    cardiac_score_quintiles = ceiling(rank_cardiac / (n() / 5)),
    rank_epilepsy = rank(epilepsy_variant_score_replication, ties.method = "first"),
    epilepsy_score_quintiles = ceiling(rank_epilepsy / (n() / 5)),
    rank_noncoding = rank(noncoding_variant_score_replication, ties.method = "first"),
    noncoding_score_quintiles = ceiling(rank_noncoding / (n() / 5)),
    rank_common = rank(common_variant_score_replication, ties.method = "first"),
    common_score_quintiles = ceiling(rank_common / (n() / 5)),
    rank_common_and_coding = rank(common_and_coding_score_replication, ties.method = "first"),
    common_and_coding_score_quintiles = ceiling(rank_common_and_coding / (n() / 5)),
    rank_model_coding_rare = rank(coding_rare_score_replication, ties.method = "first"),
    coding_rare_score_quintiles = ceiling(rank_model_coding_rare / (n() / 5)),
    rank_epilepsy_and_common = rank(epilepsy_and_common_score_replication, ties.method = "first"),
    epilepsy_and_common_score_quintiles = ceiling(rank_epilepsy_and_common / (n() / 5)),
    rank_cardiac_and_common = rank(cardiac_and_common_score_replication, ties.method = "first"),
    cardiac_and_common_score_quintiles = ceiling(rank_cardiac_and_common / (n() / 5))
  )

# Replication-cohort predicted probabilities via the inverse logit (plogis)
# of each out-of-sample score. (Trailing comma after the last mutate
# argument removed — tolerated by rlang, but non-idiomatic.)
categorized_replication_data <- categorized_replication_data %>%
  mutate(
    probability = plogis(scores_replication),
    cardiac_probability = plogis(cardiac_variant_score_replication),
    epilepsy_probability = plogis(epilepsy_variant_score_replication),
    noncoding_probability = plogis(noncoding_variant_score_replication),
    common_probability = plogis(common_variant_score_replication),
    common_and_coding_probability = plogis(common_and_coding_score_replication),
    coding_rare_probability = plogis(coding_rare_score_replication),
    epilepsy_and_common_probability = plogis(epilepsy_and_common_score_replication),
    cardiac_and_common_probability = plogis(cardiac_and_common_score_replication)
  )


# Apply function to each odds category
# Quintile odds-ratio summaries for the replication cohort, one per score.
# (`<-` replaces `=` for top-level assignment, per R style. The
# `commmon_and_epilepsy_summary_replication` spelling is preserved because
# later code may reference it by that name.)
combined_odds_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, score_quintiles)
cardiac_odds_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, cardiac_score_quintiles)
epilepsy_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, epilepsy_score_quintiles)
common_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, common_score_quintiles)
noncoding_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, noncoding_score_quintiles)
common_and_coding_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, common_and_coding_score_quintiles)
coding_rare_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, coding_rare_score_quintiles)
commmon_and_epilepsy_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, cardiac_and_common_score_quintiles)

Plot the discovery cohort odds ratios by score quintile

# Base plot: log2(odds ratio) per score quintile for the combined model in
# the discovery cohort, with the y-range sized so every model's 95% CI fits.
# (Restructured so all plot() arguments are contiguous — the original split
# the call around a comment, stranding `xaxt`/`col` far below `main` — and
# 1:length(x) replaced with seq_along(x).)
all_lower_ci <- c(combined_odds_summary$lower_ci, cardiac_odds_summary$lower_ci,
                  epilepsy_summary$lower_ci, common_summary$lower_ci,
                  noncoding_summary$lower_ci, common_and_coding_summary$lower_ci,
                  coding_rare_summary$lower_ci)
all_upper_ci <- c(combined_odds_summary$upper_ci, cardiac_odds_summary$upper_ci,
                  epilepsy_summary$upper_ci, common_summary$upper_ci,
                  noncoding_summary$upper_ci, common_and_coding_summary$upper_ci,
                  coding_rare_summary$upper_ci)
plot(log2(combined_odds_summary$odds_ratio),
     ylim = c(log2(min(all_lower_ci)), log2(max(all_upper_ci))),
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
     xaxt = "n", col = "#3C8C78")

# Combined model: connecting line and 95% CI error bars
lines(seq_along(combined_odds_summary$odds_ratio), log2(combined_odds_summary$odds_ratio), col = "#3C8C78")
arrows(x0 = seq_along(combined_odds_summary$odds_ratio),
       y0 = log2(combined_odds_summary$lower_ci),
       y1 = log2(combined_odds_summary$upper_ci),
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")

# Overlay one model's quintile series onto the current plot: connecting
# line, points, and 95% CI error bars, all in the given color.
overlay_series <- function(summary_df, col) {
  idx <- seq_along(summary_df$odds_ratio)
  lines(idx, log2(summary_df$odds_ratio), col = col)
  points(idx, log2(summary_df$odds_ratio), pch = 19, col = col)
  arrows(x0 = idx,
         y0 = log2(summary_df$lower_ci),
         y1 = log2(summary_df$upper_ci),
         code = 3, angle = 90, length = 0.05, col = col)
}

overlay_series(cardiac_odds_summary, "#2E86C1")     # Cardiac
overlay_series(epilepsy_summary, "#A93226")         # Epilepsy
overlay_series(common_summary, "#229954")           # Common
overlay_series(noncoding_summary, "#D68910")        # Noncoding
overlay_series(common_and_coding_summary, "Black")  # Common and coding
overlay_series(coding_rare_summary, "Pink")         # Coding rare

# Add x-axis labels for quintiles
axis(1, at = seq_along(combined_odds_summary$odds_ratio), labels = TRUE)

# legend
legend("topleft", legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding", "common_and_coding", "model_coding_rare"), 
       col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910", "Black", "Pink"), pch = 19, cex = 0.8)

Plot the replication cohort odds ratios by score quintile

# Base plot: log2(odds ratio) per score quintile for the combined model in
# the replication cohort, with a fixed y-range of [-2, 10].
# (Restructured so all plot() arguments are contiguous — the original split
# the call around a comment — and 1:length(x) replaced with seq_along(x).)
plot(log2(combined_odds_summary_replication$odds_ratio),
     ylim = c(-2, 10),
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
     xaxt = "n", col = "#3C8C78")

# Combined model: connecting line and 95% CI error bars
lines(seq_along(combined_odds_summary_replication$odds_ratio), log2(combined_odds_summary_replication$odds_ratio), col = "#3C8C78")
arrows(x0 = seq_along(combined_odds_summary_replication$odds_ratio),
       y0 = log2(combined_odds_summary_replication$lower_ci),
       y1 = log2(combined_odds_summary_replication$upper_ci),
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")

# Overlay one replication-model quintile series onto the current plot:
# connecting line, points, and 95% CI error bars in the given color.
overlay_replication_series <- function(summary_df, col) {
  idx <- seq_along(summary_df$odds_ratio)
  lines(idx, log2(summary_df$odds_ratio), col = col)
  points(idx, log2(summary_df$odds_ratio), pch = 19, col = col)
  arrows(x0 = idx,
         y0 = log2(summary_df$lower_ci),
         y1 = log2(summary_df$upper_ci),
         code = 3, angle = 90, length = 0.05, col = col)
}

overlay_replication_series(cardiac_odds_summary_replication, "#2E86C1")     # Cardiac
overlay_replication_series(epilepsy_summary_replication, "#A93226")         # Epilepsy
overlay_replication_series(common_summary_replication, "#229954")           # Common
overlay_replication_series(noncoding_summary_replication, "#D68910")        # Noncoding
overlay_replication_series(common_and_coding_summary_replication, "Black")  # Common and coding
overlay_replication_series(coding_rare_summary_replication, "Pink")         # Coding rare

# Add x-axis labels for quintiles
axis(1, at = seq_along(combined_odds_summary_replication$odds_ratio), labels = TRUE)

# legend
legend("topleft", legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding", "common_and_coding", "model_coding_rare"), 
       col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910", "Black", "Pink"), pch = 19, cex = 0.8)

Z test for odds significance

# Two-sided Z test comparing, quintile by quintile, the odds ratios from two
# quintile summaries (as produced by calculate_odds_ratios).
#
# Args:
#   odds1, odds2: data frames with columns odds_ratio and se_log_odds_ratio,
#                 rows aligned by quintile.
# Returns:
#   data.frame(quintile, p_values): two-sided normal-approximation p-value
#   for the difference in log odds ratios at each quintile.
compare_odds <- function(odds1, odds2) {
  # Difference in log odds ratios per quintile
  delta_log_odds <- log(odds1$odds_ratio) - log(odds2$odds_ratio)

  # SE of the difference, treating the two estimates as independent.
  # NOTE(review): both summaries typically come from the same cohort, so
  # independence is only approximate — confirm this is acceptable.
  se_delta_log_odds <- sqrt(odds1$se_log_odds_ratio^2 + odds2$se_log_odds_ratio^2)

  # Two-sided Z test for each quintile
  z_values <- delta_log_odds / se_delta_log_odds
  p_values <- 2 * pnorm(abs(z_values), lower.tail = FALSE)

  # seq_along (not 1:length) so zero-row inputs yield a zero-row result
  data.frame(quintile = seq_along(p_values), p_values = p_values)
}

# Compare each pair of score models quintile-by-quintile
p_values_coding_vs_common_coding <- compare_odds(coding_rare_summary, common_and_coding_summary)
p_values_coding_vs_model <- compare_odds(coding_rare_summary, combined_odds_summary)
p_values_common_coding_vs_model <- compare_odds(common_and_coding_summary, combined_odds_summary)

# Tag each comparison so the stacked table records where each row came from
p_values_coding_vs_common_coding$comparison <- "Coding vs Common and Coding"
p_values_coding_vs_model$comparison <- "Coding vs Model"
p_values_common_coding_vs_model$comparison <- "Common and Coding vs Model"

# Stack into one long table and order the quintile labels 1..5
combined_p_values <- bind_rows(
  p_values_coding_vs_common_coding,
  p_values_coding_vs_model,
  p_values_common_coding_vs_model
) %>%
  mutate(quintile = factor(quintile, levels = 1:5))

combined_p_values
##    quintile     p_values                  comparison
## 1         1 1.0000000000 Coding vs Common and Coding
## 2         2 0.0302326527 Coding vs Common and Coding
## 3         3 0.1655321102 Coding vs Common and Coding
## 4         4 0.0016031606 Coding vs Common and Coding
## 5         5 0.0376069426 Coding vs Common and Coding
## 6         1 1.0000000000             Coding vs Model
## 7         2 0.0459511700             Coding vs Model
## 8         3 0.0041855535             Coding vs Model
## 9         4 0.0006605028             Coding vs Model
## 10        5 0.0006065780             Coding vs Model
## 11        1 1.0000000000  Common and Coding vs Model
## 12        2 0.8875234087  Common and Coding vs Model
## 13        3 0.1454995597  Common and Coding vs Model
## 14        4 0.8097807932  Common and Coding vs Model
## 15        5 0.1582202334  Common and Coding vs Model

Calculate the scores per category and plot them

# Helper: notched boxplot of one score column by Category.
# BUG FIX: the original plots used ylim(), which silently REMOVES
# observations outside the limits BEFORE the boxplot statistics are
# computed, biasing the quartiles and notches (that is why every print
# below needed suppressWarnings). coord_cartesian() zooms the axes
# without discarding any data.
score_boxplot <- function(data, score_col, y_limits) {
  ggplot(data, aes(x = Category, y = .data[[score_col]], fill = Category)) +
    geom_boxplot(outlier.shape = NA, notch = TRUE) +
    scale_fill_manual(values = group_colors) +
    coord_cartesian(ylim = y_limits) +
    theme_cowplot(12)
}

# One boxplot per score class, zoomed to the original y ranges
common_variant_score_plot    <- score_boxplot(categorized_combined_data, "common_variant_score",    c(-2, 2))
cardiac_variant_score_plot   <- score_boxplot(categorized_combined_data, "cardiac_variant_score",   c(-1, 1))
epilepsy_variant_score_plot  <- score_boxplot(categorized_combined_data, "epilepsy_variant_score",  c(-1, 1))
noncoding_variant_score_plot <- score_boxplot(categorized_combined_data, "noncoding_variant_score", c(-1, 0.75))
scores_plot                  <- score_boxplot(categorized_combined_data, "scores",                  c(-3, 3))

# coord_cartesian() drops no rows, so plain print() emits no warnings
print(common_variant_score_plot)

print(cardiac_variant_score_plot)

print(epilepsy_variant_score_plot)

print(noncoding_variant_score_plot)

print(scores_plot)

Replication cohort

Now plot the Z normalized replication data

# Z-normalization parameters for each replication score.
# NOTE(review): mean()/sd() here have no na.rm, so any NA in these columns
# would propagate into every plotted value — assumes upstream filtering
# removed missing scores; confirm.
mean_common_replication <- mean(categorized_replication_data$common_variant_score_replication)
sd_common_replication <- sd(categorized_replication_data$common_variant_score_replication)
mean_cardiac_replication <- mean(categorized_replication_data$cardiac_variant_score_replication)
sd_cardiac_replication <- sd(categorized_replication_data$cardiac_variant_score_replication)
mean_epilepsy_replication <- mean(categorized_replication_data$epilepsy_variant_score_replication)
sd_epilepsy_replication <- sd(categorized_replication_data$epilepsy_variant_score_replication)
mean_noncoding_replication <- mean(categorized_replication_data$noncoding_variant_score_replication)
sd_noncoding_replication <- sd(categorized_replication_data$noncoding_variant_score_replication)
mean_scores_replication <- mean(categorized_replication_data$scores_replication)
sd_scores_replication <- sd(categorized_replication_data$scores_replication)


# Helper: notched boxplot of a Z-normalized replication score by Category.
# BUG FIX: the original plots used ylim(), which silently DROPS
# observations outside [-2.5, 2.5] before the boxplot statistics are
# computed, biasing quartiles and notches (hence the suppressWarnings
# around the original prints). coord_cartesian() zooms without
# discarding data.
replication_z_boxplot <- function(data, score_col, center, spread) {
  ggplot(data, aes(x = Category, y = (.data[[score_col]] - center) / spread, fill = Category)) +
    geom_boxplot(outlier.shape = NA, notch = TRUE) +
    scale_fill_manual(values = group_colors) +
    coord_cartesian(ylim = c(-2.5, 2.5)) +
    theme_cowplot(12)
}

common_variant_score_replication_plot <- replication_z_boxplot(
  categorized_replication_data, "common_variant_score_replication",
  mean_common_replication, sd_common_replication)

cardiac_variant_score_replication_plot <- replication_z_boxplot(
  categorized_replication_data, "cardiac_variant_score_replication",
  mean_cardiac_replication, sd_cardiac_replication)

epilepsy_variant_score_replication_plot <- replication_z_boxplot(
  categorized_replication_data, "epilepsy_variant_score_replication",
  mean_epilepsy_replication, sd_epilepsy_replication)

noncoding_variant_score_replication_plot <- replication_z_boxplot(
  categorized_replication_data, "noncoding_variant_score_replication",
  mean_noncoding_replication, sd_noncoding_replication)

scores_replication_plot <- replication_z_boxplot(
  categorized_replication_data, "scores_replication",
  mean_scores_replication, sd_scores_replication)

# coord_cartesian() drops no rows, so plain print() emits no warnings
print(common_variant_score_replication_plot)

print(cardiac_variant_score_replication_plot)

print(epilepsy_variant_score_replication_plot)

print(noncoding_variant_score_replication_plot)

print(scores_replication_plot)

Compute the wilcox.tests

# Two-sided Wilcoxon rank-sum (Mann-Whitney) tests: does each score's
# distribution differ between the two Category levels in the combined
# discovery cohort? Non-parametric, so no normality assumption is made.
# Cardiac (cardiomyopathy/arrhythmia) rare-variant score
wilcox.test_cardiomyopathy <- wilcox.test(cardiac_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_cardiomyopathy
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  cardiac_variant_score by Category
## W = 76739, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Epilepsy rare-variant score
wilcox.test_epilepsy <- wilcox.test(epilepsy_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_epilepsy
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  epilepsy_variant_score by Category
## W = 86944, p-value = 3.215e-15
## alternative hypothesis: true location shift is not equal to 0
# Common-variant (polygenic) score
wilcox.test_common <- wilcox.test(common_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_common
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  common_variant_score by Category
## W = 77863, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Noncoding rare-variant score
wilcox.test_noncoding <- wilcox.test(noncoding_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_noncoding
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  noncoding_variant_score by Category
## W = 101916, p-value = 5.208e-06
## alternative hypothesis: true location shift is not equal to 0
# Combined (full model) score
wilcox.test_combined <- wilcox.test(scores ~ Category, data = categorized_combined_data)
wilcox.test_combined
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  scores by Category
## W = 57835, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Replication cohort

# Same Wilcoxon rank-sum tests as above, repeated in the independent
# replication cohort.
# Cardiac rare-variant score
wilcox.test_cardiomyopathy_replication <- wilcox.test(cardiac_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_cardiomyopathy_replication
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  cardiac_variant_score_replication by Category
## W = 850, p-value = 3.419e-08
## alternative hypothesis: true location shift is not equal to 0
# Epilepsy rare-variant score
wilcox.test_epilepsy_replication <- wilcox.test(epilepsy_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_epilepsy_replication
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  epilepsy_variant_score_replication by Category
## W = 1286, p-value = 0.0006998
## alternative hypothesis: true location shift is not equal to 0
# Common-variant score (note: does not replicate at alpha = 0.05)
wilcox.test_common_replication <- wilcox.test(common_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_common_replication
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  common_variant_score_replication by Category
## W = 1667, p-value = 0.1309
## alternative hypothesis: true location shift is not equal to 0
# Noncoding rare-variant score
wilcox.test_noncoding_replication <- wilcox.test(noncoding_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_noncoding_replication
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  noncoding_variant_score_replication by Category
## W = 523, p-value = 1.213e-12
## alternative hypothesis: true location shift is not equal to 0
# Combined (full model) score
wilcox.test_combined_replication <- wilcox.test(scores_replication ~ Category, data = categorized_replication_data)
wilcox.test_combined_replication
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  scores_replication by Category
## W = 498, p-value = 4.973e-13
## alternative hypothesis: true location shift is not equal to 0

Plot the distribution of the categories by score

# Overlaid density of the combined score by Category.
# geom_density() maps density to the y axis by default, so the deprecated
# dot-dot aesthetic `aes(y = ..density..)` the original passed was
# redundant and has been removed (identical output, and no deprecation
# warning on ggplot2 >= 3.4, where after_stat() replaced `..var..`).
# NOTE(review): scale_x_continuous(limits = c(-3, 4)) removes observations
# outside that range BEFORE the density is estimated (hence the
# suppressWarnings); if pure zooming is intended, use
# coord_cartesian(xlim = c(-3, 4)) instead — confirm intent.
distribution_score_plot <- ggplot(categorized_combined_data, aes(x = scores, fill = Category)) + 
  geom_density(alpha = 0.5, adjust = 1) +
  scale_fill_manual(values = group_colors) +
  scale_x_continuous(limits = c(-3, 4)) +
  labs(title = "Normalized Density Plots of Scores by Category",
       x = "Score",
       y = "Density") +
  theme_cowplot(12) 


suppressWarnings(print(distribution_score_plot))

Plot the filled density of the categories by score

# Stacked (conditional) density: at each score value, the fraction of the
# local density contributed by each Category, shown as a percentage.
density_score_plot <- categorized_combined_data %>%
  ggplot(aes(x = scores, fill = Category)) +
  geom_density(position = "fill", alpha = 0.5) +
  scale_fill_manual(values = group_colors) +
  scale_x_continuous(limits = c(-3, 4)) +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Stacked Density of Scores by Category",
       x = "Score",
       y = "Fraction") +
  theme_cowplot()

# Render (warnings about rows outside the x limits are suppressed)
suppressWarnings(print(density_score_plot))

Plot the ROCs

# Convert 'Category' into a binary outcome: 0 for control, 1 for case
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)

# ROC curves for each nested model's fitted probabilities.
# Full model (all variant classes)
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

paste("Full model AUC:")
## [1] "Full model AUC:"
auc(roc_result_full)
## Area under the curve: 0.7638
# Common variants only (typo fix: "commonvariant" -> "common variant")
roc_result_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

paste("common variant model AUC:")
## [1] "common variant model AUC:"
auc(roc_result_common)
## Area under the curve: 0.682
# Coding (rare) variants only
roc_result_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$coding_rare_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

paste("coding variant model AUC:")
## [1] "coding variant model AUC:"
auc(roc_result_coding)
## Area under the curve: 0.7008
# Cardiac and common variants.
# BUG FIX: the label below previously read "coding variant model AUC:",
# which mislabeled this model's AUC in the rendered output.
roc_result_cardiac_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_and_common_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

paste("cardiac and common variant model AUC:")
## [1] "cardiac and common variant model AUC:"
auc(roc_result_cardiac_and_common)
## Area under the curve: 0.7364
# Common and coding variants
roc_result_common_and_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_and_coding_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

paste("common and coding variant model AUC:")
## [1] "common and coding variant model AUC:"
auc(roc_result_common_and_coding)
## Area under the curve: 0.7449

Plot the replication ROC and Precision-recall

# Convert 'Category' into a binary format: 0 for control, 1 for case
categorized_replication_data$binary_outcome <- ifelse(categorized_replication_data$Category == "Control", 0, 1)

# ROC curve for the full model's probabilities in the independent
# replication cohort
roc_result_full_replication <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Replication AUC (the discovery full-model AUC above was 0.7638)
auc(roc_result_full_replication)
## Area under the curve: 0.874

Figure out the ROC improvement with each variant class and plot it

# Compute the ROC curves not already produced above (no plots needed)
roc_result_cardiac <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_cardiac_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate AUC for each ROC object (kept as individual variables in case
# later sections reference them)
auc_epilepsy <- auc(roc_result_epilepsy)
auc_cardiac <- auc(roc_result_cardiac)
auc_common <- auc(roc_result_common)
auc_epilepsy_and_common <- auc(roc_result_epilepsy_and_common)
auc_cardiac_and_common <- auc(roc_result_cardiac_and_common)
auc_common_and_coding <- auc(roc_result_common_and_coding)
auc_coding <- auc(roc_result_coding)
auc_full <- auc(roc_result_full)

# Base AUC for random chance
auc_random_chance <- 0.5

# Single named vector of AUCs so both data frames below are derived from
# one source instead of eight copy-pasted expressions
model_aucs <- c(
  "Epilepsy" = as.numeric(auc_epilepsy),
  "Cardiac" = as.numeric(auc_cardiac),
  "Common" = as.numeric(auc_common),
  "Epilepsy and Common" = as.numeric(auc_epilepsy_and_common),
  "Cardiac and Common" = as.numeric(auc_cardiac_and_common),
  "Common and Coding" = as.numeric(auc_common_and_coding),
  "Coding" = as.numeric(auc_coding),
  "Full" = as.numeric(auc_full)
)

# Percent improvement of each model's AUC over random chance
percentage_changes_df <- data.frame(
  Model = names(model_aucs),
  PercentageChange = (model_aucs - auc_random_chance) / auc_random_chance * 100,
  row.names = NULL
)

# Raw AUC per model
auc_df <- data.frame(
  Model = names(model_aucs),
  auc = model_aucs,
  row.names = NULL
)

# Set factor levels to fix the display order of the bars
percentage_changes_df$Model <- factor(percentage_changes_df$Model, levels = c(
  "Epilepsy","Cardiac","Coding", "Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Full"
))

# Plot. (theme_minimal() removed: theme_cowplot(12) is a complete theme
# added afterwards, so it fully replaced theme_minimal() anyway.)
auc_performance_plot <- ggplot(percentage_changes_df, aes(x = Model, y = PercentageChange, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(title = "Percentage Change Over Random Chance AUC",
       y = "Percentage Change (%)",
       x = "") +
  theme_cowplot(12)

print(auc_performance_plot)

# Calculate p-values using DeLong's test (roc.test default) for pairs of
# ROC curves built on the same observations.
# NOTE(review): "CMAR" appears to denote the cardiac model — it is always
# paired with roc_result_cardiac below — confirm the intended expansion.
p_values <- list(
  "Epilepsy vs CMAR" = roc.test(roc_result_epilepsy, roc_result_cardiac)$p.value,
  "CMAR vs Common" = roc.test(roc_result_common, roc_result_cardiac)$p.value,
  "Epilepsy vs Common" = roc.test(roc_result_common, roc_result_epilepsy)$p.value,
  "Coding vs CMAR" = roc.test(roc_result_coding, roc_result_cardiac)$p.value,
  "Coding vs Epilepsy" = roc.test(roc_result_coding, roc_result_epilepsy)$p.value,
  "CMAR and Common vs Common" = roc.test(roc_result_cardiac_and_common, roc_result_common)$p.value,
  "Common vs Coding" = roc.test(roc_result_common, roc_result_coding)$p.value,
  "CMAR and common vs Common and Coding" = roc.test(roc_result_cardiac_and_common, roc_result_common_and_coding)$p.value,
  "Common vs Common and Coding" = roc.test(roc_result_common, roc_result_common_and_coding)$p.value,
  "Full vs Common and Coding" = roc.test(roc_result_common_and_coding, roc_result_full)$p.value
)

# Print p-values
print(p_values)
## $`Epilepsy vs CMAR`
## [1] 0.03714183
## 
## $`CMAR vs Common`
## [1] 0.8236934
## 
## $`Epilepsy vs Common`
## [1] 0.05250968
## 
## $`Coding vs CMAR`
## [1] 0.1295995
## 
## $`Coding vs Epilepsy`
## [1] 4.40734e-05
## 
## $`CMAR and Common vs Common`
## [1] 3.718671e-06
## 
## $`Common vs Coding`
## [1] 0.336048
## 
## $`CMAR and common vs Common and Coding`
## [1] 0.06421658
## 
## $`Common vs Common and Coding`
## [1] 2.86077e-07
## 
## $`Full vs Common and Coding`
## [1] 0.004615593

Compute the deviance attributable to each class

# Deviance decomposition: how much of the case/control deviance does each
# predictor explain? Fit an intercept-only (null) logistic model first.
# NOTE(review): `model` used below is the full logistic regression fit
# earlier in the file (not shown here) — confirm it was fit on the same
# data as this null model.
null_model <- glm(Category ~ 1, data = categorized_combined_data, family = binomial())

# Sequential (Type I) analysis of deviance on the full model: terms are
# added in model order (see "Terms added sequentially" in the output), so
# each row's Deviance is conditional on the terms above it.
anova_results <- anova(model, test = "Chisq")

# Calculate the null deviance and the full model deviance
null_deviance <- deviance(null_model)
full_deviance <- deviance(model)

# Total deviance explained by the full model (deviance analogue of
# TSS - RSS in linear regression)
tss <- null_deviance - full_deviance

# Each term's share of the total explained deviance; because the
# sequential Deviance column sums to `tss`, the shares sum to 1, but they
# are order-dependent.
anova_results$Proportion_Variance_Explained <- anova_results$`Deviance` / tss

# Display the results
anova_results
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Category
## 
## Terms added sequentially (first to last)
## 
## 
##                               Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL                                            992     1370.0         
## Normalized_Heart_rate          1    8.094       991     1361.9  0.00444
## Normalized_PR_interval         1    6.193       990     1355.7  0.01282
## Normalized_QTc                 1    5.429       989     1350.3  0.01980
## Normalized_BP                  1   21.492       988     1328.8  0.00000
## Normalized_QRS                 1   32.145       987     1296.6  0.00000
## Normalized_LVEF                1   11.611       986     1285.0  0.00066
## Normalized_LVESV               1    2.819       985     1282.2  0.09316
## Normalized_SVI                 1    4.344       984     1277.8  0.03715
## Normalized_Afib                1    0.736       983     1277.1  0.39103
## Normalized_LVEDVI              1    6.693       982     1270.4  0.00968
## Normalized_SV                  1    1.424       981     1269.0  0.23282
## Epilepsy_rare                  1    7.220       980     1261.8  0.00721
## Epilepsy_ultrarare             1   11.748       979     1250.0  0.00061
## Cardiomyopathy_rare            1    6.754       978     1243.3  0.00935
## Cardiomyopathy_ultrarare       1    3.595       977     1239.7  0.05795
## PLP_Epilepsy                   1    0.009       976     1239.7  0.92564
## PLP_Cardiomyopathy             1   46.892       975     1192.8  0.00000
## Cardiomyopathy_noncoding_rare  1    3.390       974     1189.4  0.06561
## Epilepsy_noncoding_rare        1   35.430       973     1154.0  0.00000
## Cardiomyopathy_null            1   16.730       972     1137.2  0.00004
## Epilepsy_null                  1    4.795       971     1132.4  0.02855
##                               Proportion_Variance_Explained
## NULL                                                       
## Normalized_Heart_rate                              0.034075
## Normalized_PR_interval                             0.026072
## Normalized_QTc                                     0.022857
## Normalized_BP                                      0.090478
## Normalized_QRS                                     0.135322
## Normalized_LVEF                                    0.048878
## Normalized_LVESV                                   0.011867
## Normalized_SVI                                     0.018286
## Normalized_Afib                                    0.003097
## Normalized_LVEDVI                                  0.028176
## Normalized_SV                                      0.005993
## Epilepsy_rare                                      0.030394
## Epilepsy_ultrarare                                 0.049455
## Cardiomyopathy_rare                                0.028434
## Cardiomyopathy_ultrarare                           0.015135
## PLP_Epilepsy                                       0.000037
## PLP_Cardiomyopathy                                 0.197408
## Cardiomyopathy_noncoding_rare                      0.014270
## Epilepsy_noncoding_rare                            0.149152
## Cardiomyopathy_null                                0.070428
## Epilepsy_null                                      0.020185

Redo the AUC “improvement” testing in the independent replication cohort

# Compute ROC curves for all models in the replication cohort
# (these reassign the roc_result_* variables used by the discovery
# section above, mirroring the original script's flow)
roc_result_cardiac <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$cardiac_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$epilepsy_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy_and_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$epilepsy_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_cardiac_and_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$cardiac_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_common_and_coding <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$common_and_coding_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_coding <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$coding_rare_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_full <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate AUC for each ROC object
auc_epilepsy <- auc(roc_result_epilepsy)
auc_cardiac <- auc(roc_result_cardiac)
auc_common <- auc(roc_result_common)
auc_epilepsy_and_common <- auc(roc_result_epilepsy_and_common)
auc_cardiac_and_common <- auc(roc_result_cardiac_and_common)
auc_common_and_coding <- auc(roc_result_common_and_coding)
auc_coding <- auc(roc_result_coding)
auc_full <- auc(roc_result_full)

# Base AUC for random chance
auc_random_chance <- 0.5

# Single named vector of AUCs so both data frames below are derived from
# one source instead of eight copy-pasted expressions (same refactor as
# the discovery section)
model_aucs <- c(
  "Epilepsy" = as.numeric(auc_epilepsy),
  "Cardiac" = as.numeric(auc_cardiac),
  "Common" = as.numeric(auc_common),
  "Epilepsy and Common" = as.numeric(auc_epilepsy_and_common),
  "Cardiac and Common" = as.numeric(auc_cardiac_and_common),
  "Common and Coding" = as.numeric(auc_common_and_coding),
  "Coding" = as.numeric(auc_coding),
  "Full" = as.numeric(auc_full)
)

# Percent improvement of each model's AUC over random chance
percentage_changes_df <- data.frame(
  Model = names(model_aucs),
  PercentageChange = (model_aucs - auc_random_chance) / auc_random_chance * 100,
  row.names = NULL
)

# Raw AUC per model
auc_df <- data.frame(
  Model = names(model_aucs),
  auc = model_aucs,
  row.names = NULL
)

# Set factor levels to fix the display order of the bars
percentage_changes_df$Model <- factor(percentage_changes_df$Model, levels = c(
  "Epilepsy", "Cardiac", "Coding", "Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Full"
))

# Plot. (theme_minimal() removed: theme_cowplot(12) is a complete theme
# added afterwards, so it fully replaced theme_minimal() anyway.)
auc_performance_plot <- ggplot(percentage_changes_df, aes(x = Model, y = PercentageChange, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(title = "Percentage Change Over Random Chance AUC",
       y = "Percentage Change (%)",
       x = "") +
  theme_cowplot(12)

print(auc_performance_plot)

# Calculate p-values using DeLong's test (roc.test default) for pairs of
# replication-cohort ROC curves.
# NOTE(review): the final comparison here is "Full vs CMAR", whereas the
# discovery section tested "Full vs Common and Coding" — confirm this
# difference is intentional.
p_values <- list(
  "Epilepsy vs CMAR" = roc.test(roc_result_epilepsy, roc_result_cardiac)$p.value,
  "CMAR vs Common" = roc.test(roc_result_common, roc_result_cardiac)$p.value,
  "Epilepsy vs Common" = roc.test(roc_result_common, roc_result_epilepsy)$p.value,
  "Coding vs CMAR" = roc.test(roc_result_coding, roc_result_cardiac)$p.value,
  "Coding vs Epilepsy" = roc.test(roc_result_coding, roc_result_epilepsy)$p.value,
  "CMAR and Common vs Common" = roc.test(roc_result_cardiac_and_common, roc_result_common)$p.value,
  "Common vs Coding" = roc.test(roc_result_common, roc_result_coding)$p.value,
  "CMAR and common vs Common and Coding" = roc.test(roc_result_cardiac_and_common, roc_result_common_and_coding)$p.value,
  "Common vs Common and Coding" = roc.test(roc_result_common, roc_result_common_and_coding)$p.value,
  "Full vs CMAR" = roc.test(roc_result_cardiac, roc_result_full)$p.value
)

# Print p-values
print(p_values)
## $`Epilepsy vs CMAR`
## [1] 0.05440609
## 
## $`CMAR vs Common`
## [1] 0.0005775265
## 
## $`Epilepsy vs Common`
## [1] 0.1143759
## 
## $`Coding vs CMAR`
## [1] 0.7027383
## 
## $`Coding vs Epilepsy`
## [1] 0.003616045
## 
## $`CMAR and Common vs Common`
## [1] 0.004465682
## 
## $`Common vs Coding`
## [1] 0.001003417
## 
## $`CMAR and common vs Common and Coding`
## [1] 0.1669517
## 
## $`Common vs Common and Coding`
## [1] 0.0009395074
## 
## $`Full vs CMAR`
## [1] 0.04508866

Let’s see how common and rare cardiac variants interact

# Scatter of per-individual ranks: rare cardiac variant score vs common
# variant score, coloured by case/control (arrest) status
dot_plot <- categorized_combined_data %>%
  ggplot(aes(x = rank_cardiac, y = rank_common, color = Category)) +
  geom_point(alpha = 1) +
  scale_color_manual(values = group_colors) +
  labs(x = "Cardiac Variant Score Rank",
       y = "Common Variant Score Rank",
       color = "Arrest Status") +
  theme_cowplot(12) +
  theme(strip.text.x = element_text(size = 10))

dot_plot

# Medians of the two rank scores define the quadrant boundaries
median_cardiac <- median(categorized_combined_data$rank_cardiac, na.rm = TRUE)
median_common <- median(categorized_combined_data$rank_common, na.rm = TRUE)

# Partition individuals into four quadrants around the two medians.
# BUG FIX: the original conditions overlapped — quadrant 1 used <=/<= and
# quadrant 4 used >=/>=, so anyone sitting exactly on BOTH medians was
# counted in both quadrants (double-counted in combined_quadrants), while
# quadrants 2 and 3 used strict </> tests inconsistent with 1 and 4.
# The conditions below are mutually exclusive and exhaustive:
# "low" means <= median, "high" means > median.
quadrant1 <- categorized_combined_data %>%
  filter(rank_cardiac <= median_cardiac & rank_common <= median_common)

quadrant2 <- categorized_combined_data %>%
  filter(rank_cardiac <= median_cardiac & rank_common > median_common)

quadrant3 <- categorized_combined_data %>%
  filter(rank_cardiac > median_cardiac & rank_common <= median_common)

quadrant4 <- categorized_combined_data %>%
  filter(rank_cardiac > median_cardiac & rank_common > median_common)


# Combine into one data frame, tagging each row with its quadrant
combined_quadrants <- bind_rows(
  quadrant1 %>% mutate(Quadrant = 'Quadrant 1'),
  quadrant2 %>% mutate(Quadrant = 'Quadrant 2'),
  quadrant3 %>% mutate(Quadrant = 'Quadrant 3'),
  quadrant4 %>% mutate(Quadrant = 'Quadrant 4')
)

# Calculate the count of individuals in each quadrant for each category
# and the total count of individuals in each category, then each
# quadrant's percentage share within its category.
# Note: the result is left grouped by Category.
percentage_by_category <- combined_quadrants %>%
  group_by(Category, Quadrant) %>%
  summarise(Count = n(), .groups = 'drop') %>%
  group_by(Category) %>%
  mutate(Total = sum(Count), Percentage = Count / Total * 100)

# View the result (cases appear as "zCase" — presumably prefixed with 'z'
# upstream for factor/level ordering; confirm)
percentage_by_category
## # A tibble: 8 × 5
## # Groups:   Category [2]
##   Category Quadrant   Count Total Percentage
##   <fct>    <chr>      <int> <int>      <dbl>
## 1 Control  Quadrant 1   235   537       43.8
## 2 Control  Quadrant 2   107   537       19.9
## 3 Control  Quadrant 3    98   537       18.2
## 4 Control  Quadrant 4    97   537       18.1
## 5 zCase    Quadrant 1    79   456       17.3
## 6 zCase    Quadrant 2    75   456       16.4
## 7 zCase    Quadrant 3    84   456       18.4
## 8 zCase    Quadrant 4   218   456       47.8
# Overall proportion of the cohort in each quadrant: the expected
# distribution if quadrant membership were independent of Category
overall_counts <- combined_quadrants %>%
  count(Quadrant) %>%
  mutate(OverallProportion = n / sum(n))

# Calculate total counts by category for scaling expected values
category_totals <- combined_quadrants %>%
  count(Category) %>%
  dplyr::rename(TotalInCategory = n)

# Expected quadrant counts per category under independence
expected_counts <- combined_quadrants %>%
  dplyr::select(Category, Quadrant) %>%
  distinct() %>%
  left_join(overall_counts, by = "Quadrant") %>%
  left_join(category_totals, by = "Category") %>%
  mutate(Expected = OverallProportion * TotalInCategory)

# Observed quadrant counts per category (also reused for the contingency
# table further below)
observed_counts <- combined_quadrants %>%
  count(Category, Quadrant) %>%
  dplyr::rename(Observed = n)

# Combine observed and expected for plotting. Reuses observed_counts
# instead of recounting the data, and computes Ratio exactly once — the
# original recounted Category x Quadrant and then mutated Ratio a second
# time with an identical expression.
combined_for_plotting <- observed_counts %>%
  left_join(overall_counts, by = "Quadrant") %>%
  left_join(category_totals, by = "Category") %>%
  mutate(Expected = OverallProportion * TotalInCategory,
         Ratio = Observed / Expected)

# Plot the Observed/Expected ratio; the dashed line marks parity (O/E = 1)
quadrant_plot <- ggplot(combined_for_plotting, aes(x = Quadrant, y = Ratio, fill = Category)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(y = "Observed/Expected Ratio", x = "Quadrant", title = "Observed/Expected Ratios by Category and Quadrant") +
  theme_cowplot(12) +
  scale_fill_manual(values = group_colors) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "Black")

quadrant_plot

# Build the Category x Quadrant contingency table from the observed counts
contingency_table <- xtabs(Observed ~ Category + Quadrant, data = observed_counts)

# Print the contingency table
print(contingency_table)
##          Quadrant
## Category  Quadrant 1 Quadrant 2 Quadrant 3 Quadrant 4
##   Control        235        107         98         97
##   zCase           79         75         84        218
# Pearson's chi-squared test of independence between case/control status
# and quadrant membership
chi_squared_result <- chisq.test(contingency_table)

# Print the results of the chi-squared test
print(chi_squared_result)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 124.91, df = 3, p-value < 2.2e-16

Filter high cardiac-score individuals, visualize their cardiac vs. common variant scores, and assess interactions

# Restrict to individuals in the top two cardiac-score quintiles (4 and 5)
top_2_quintiles <- categorized_combined_data %>%
  filter(cardiac_score_quintiles > 3)


# Cardiac vs common variant score among these individuals, with a
# per-category linear fit; the index individual CGM0000563 is drawn in red
# on top of the other layers.
common_density_score_plot <- ggplot(top_2_quintiles,
                                    aes(x = cardiac_variant_score, y = common_variant_score)) +
  geom_point(aes(color = Category), alpha = 0.5) +
  geom_smooth(method = "lm", aes(group = Category, color = Category)) +
  geom_point(data = subset(top_2_quintiles, ID == "CGM0000563"), color = "red") +
  scale_color_manual(values = group_colors) +
  labs(x = "Cardiac Variant Score",
       y = "Common Variant Score",
       title = "Cardiac Variant Score vs. Common Variant Score by Category") +
  theme_cowplot(12)

# Render the scatter with fits
print(common_density_score_plot)
## `geom_smooth()` using formula = 'y ~ x'

#Do the stats
# Linear model with an interaction term: the cardiac_variant_score:Category
# coefficient tests whether the slope of common_variant_score on
# cardiac_variant_score differs between Control and zCase.
model_common_cardiac <- lm(common_variant_score ~ cardiac_variant_score * Category, data = top_2_quintiles)
summary(model_common_cardiac)
## 
## Call:
## lm(formula = common_variant_score ~ cardiac_variant_score * Category, 
##     data = top_2_quintiles)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8514 -0.3866  0.0569  0.4076  1.5320 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         -0.22986    0.05737  -4.006 7.38e-05 ***
## cardiac_variant_score               -0.16418    0.09335  -1.759   0.0794 .  
## CategoryzCase                        0.40285    0.07411   5.436 9.59e-08 ***
## cardiac_variant_score:CategoryzCase  0.16371    0.09635   1.699   0.0901 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6477 on 394 degrees of freedom
## Multiple R-squared:  0.1034, Adjusted R-squared:  0.09659 
## F-statistic: 15.15 on 3 and 394 DF,  p-value: 2.386e-09

Visualize the relationship between cardiac and epilepsy variant scores

# Scatter of cardiac vs. epilepsy variant scores within the top two
# cardiac-score quintiles, with per-Category linear fits. The individual
# with ID "CGM0000563" is overplotted in red as the final layer.
epilepsy_density_score_plot <- ggplot(top_2_quintiles,
                                      aes(x = cardiac_variant_score,
                                          y = epilepsy_variant_score)) +
  geom_point(aes(color = Category), alpha = 0.5) +
  geom_smooth(aes(group = Category, color = Category), method = "lm") +
  xlab("Cardiac Variant Score") +
  ylab("Epilepsy Variant Score") +
  ggtitle("Cardiac Variant Score vs. Epilepsy Variant Score by Category") +
  scale_color_manual(values = group_colors) +
  theme_cowplot(12) +
  geom_point(data = subset(top_2_quintiles, ID == "CGM0000563"),
             aes(x = cardiac_variant_score, y = epilepsy_variant_score),
             color = "red")

# Print
print(epilepsy_density_score_plot)
## `geom_smooth()` using formula = 'y ~ x'

#Do the stats
# Linear model with an interaction term: the cardiac_variant_score:Category
# coefficient tests whether the slope of epilepsy_variant_score on
# cardiac_variant_score differs between Control and zCase.
model_epilepsy_cardiac <- lm(epilepsy_variant_score ~ cardiac_variant_score * Category, data = top_2_quintiles)
summary(model_epilepsy_cardiac)
## 
## Call:
## lm(formula = epilepsy_variant_score ~ cardiac_variant_score * 
##     Category, data = top_2_quintiles)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8187 -0.4169 -0.0110  0.3811  9.3169 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -0.1577     0.0861  -1.832   0.0677 .  
## cardiac_variant_score                -0.1611     0.1401  -1.150   0.2509    
## CategoryzCase                        -0.2170     0.1112  -1.952   0.0517 .  
## cardiac_variant_score:CategoryzCase   0.9352     0.1446   6.468 2.95e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.972 on 394 degrees of freedom
## Multiple R-squared:  0.5589, Adjusted R-squared:  0.5555 
## F-statistic: 166.4 on 3 and 394 DF,  p-value: < 2.2e-16

Do the stats for the interaction data

# do wilcox_test
# Nonparametric comparison of epilepsy variant scores between zCase and
# Control within the top two cardiac-score quintiles. The filter is a
# safeguard in case Category ever takes values other than these two.
CMARR_epilepsy_wilcox_test_results <- wilcox.test(epilepsy_variant_score ~ Category,
                                  data = top_2_quintiles %>% filter(Category %in% c("zCase", "Control")))



CMARR_epilepsy_wilcox_test_results
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  epilepsy_variant_score by Category
## W = 11515, p-value = 4.869e-10
## alternative hypothesis: true location shift is not equal to 0
# Same comparison, for the common variant score.
CMARR_common_wilcox_test_results <- wilcox.test(common_variant_score ~ Category,
                                  data = top_2_quintiles %>% filter(Category %in% c("zCase", "Control")))


CMARR_common_wilcox_test_results
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  common_variant_score by Category
## W = 11649, p-value = 1.063e-09
## alternative hypothesis: true location shift is not equal to 0

Calculate how well GAPS performs in PLP-negative individuals (no pathogenic/likely-pathogenic cardiomyopathy variants)

# PLP-negative (zero PLP cardiomyopathy variants) individuals whose GAPS
# probability clears the discovery threshold of 0.83.
no_PLPs_categorized_data <- categorized_combined_data %>%
  filter(PLP_Cardiomyopathy < 1 & probability > 0.83)

# Count the number of rows where arrest_group equals 1 (Controls)
# NOTE(review): arrest_group is compared numerically here (1 = control,
# > 1 = case), unlike the "Control"/"Case" string labels used earlier in
# this file — confirm this coding against the input data.
paste("number of PLP negative DISCOVERY controls over the threshold")
## [1] "number of PLP negative DISCOVERY controls over the threshold"
nrow(no_PLPs_categorized_data %>% filter(arrest_group == 1))
## [1] 0
# Count the number of rows where arrest_group equals > 1 (Cases)
paste("number of PLP negative DISCOVERY cases over the threshold")
## [1] "number of PLP negative DISCOVERY cases over the threshold"
nrow(no_PLPs_categorized_data %>% filter(arrest_group > 1))
## [1] 22
#for replication
# The replication cohort uses a lower probability threshold (0.7).
no_PLPs_categorized_replication_data <- categorized_replication_data %>%
  filter(PLP_Cardiomyopathy < 1 & probability > 0.7)

# Count the number of rows where arrest_group equals 1 (Controls)
paste("number of PLP negative REPLICATION controls over the threshold")
## [1] "number of PLP negative REPLICATION controls over the threshold"
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group == 1))
## [1] 0
# Count the number of rows where arrest_group equals > 1 (Cases)
paste("number of PLP negative REPLICATION cases over the threshold")
## [1] "number of PLP negative REPLICATION cases over the threshold"
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group > 1))
## [1] 22

Now, retrain the model on the replication (validation) cohort and apply it back to the discovery cohort

# Logistic regression refit on the replication cohort, using the same
# normalized PRS and rare-variant burden predictors as the discovery model.
# The glm.fit warning below indicates (quasi-)complete separation for at
# least one predictor, so some coefficient estimates (e.g. PLP_Epilepsy)
# are unstable; downstream use relies on the linear predictor, not on
# individual coefficients.
model_replication <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare  + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare  + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null, 
             data = categorized_replication_data, family = binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model_replication
## 
## Call:  glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval + 
##     Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + 
##     Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + 
##     Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + 
##     Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + 
##     Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + 
##     Cardiomyopathy_null + Epilepsy_null, family = binomial(), 
##     data = categorized_replication_data)
## 
## Coefficients:
##                   (Intercept)          Normalized_Heart_rate  
##                    -10.632909                      -0.489645  
##        Normalized_PR_interval                 Normalized_QTc  
##                     -0.468604                      -0.520174  
##                 Normalized_BP                 Normalized_QRS  
##                      0.909731                       0.538230  
##               Normalized_LVEF               Normalized_LVESV  
##                     -0.863017                      -0.791458  
##                Normalized_SVI                Normalized_Afib  
##                      0.005339                       0.842873  
##             Normalized_LVEDVI                  Normalized_SV  
##                     -0.774451                       0.390626  
##                 Epilepsy_rare             Epilepsy_ultrarare  
##                     -0.127564                      -1.765110  
##           Cardiomyopathy_rare       Cardiomyopathy_ultrarare  
##                     -0.560834                       1.434416  
##                  PLP_Epilepsy             PLP_Cardiomyopathy  
##                     -9.841170                       0.909586  
## Cardiomyopathy_noncoding_rare        Epilepsy_noncoding_rare  
##                     -0.051308                       0.361626  
##           Cardiomyopathy_null                  Epilepsy_null  
##                      6.534668                       0.974009  
## 
## Degrees of Freedom: 125 Total (i.e. Null);  104 Residual
## Null Deviance:       174.2 
## Residual Deviance: 47.6  AIC: 91.6
# Apply the replication-trained model to the discovery cohort.
# type = "link" returns linear-predictor (log-odds) scores, not probabilities.
scores_replication_model_on_discovery <- predict(model_replication, newdata = categorized_combined_data,  type = "link")

# Add scores to the dataframes
categorized_combined_data$scores_replication_model_on_discovery <- scores_replication_model_on_discovery

# z normalize
mean_replication_on_discovery <- mean(categorized_combined_data$scores_replication_model_on_discovery)
sd_replication_on_discovery <- sd(categorized_combined_data$scores_replication_model_on_discovery)


# Notched boxplots of z-normalized scores by Category. ylim() clips the axis
# to [-2.5, 2.5], which removes some extreme points (hence the warning below).
scores_replication_model_on_discovery_plot <-  ggplot(categorized_combined_data, aes(x = Category, y = (scores_replication_model_on_discovery - mean_replication_on_discovery)/sd_replication_on_discovery, fill = Category)) +
        geom_boxplot(outlier.shape = NA, notch = TRUE) +  
    scale_fill_manual(values = group_colors) +
    ylim(-2.5, 2.5) +  
    theme_cowplot(12)

scores_replication_model_on_discovery_plot
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# wilcox.test
# Rank-based test of score separation between categories; run on the raw
# scores (z-normalization would not change the ranks).
wilcox.test_result_replication_on_discovery <- wilcox.test(scores_replication_model_on_discovery ~ Category, data = categorized_combined_data)

# View the results
print(wilcox.test_result_replication_on_discovery)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  scores_replication_model_on_discovery by Category
## W = 84150, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Calculate GAPS using betas both trained on and applied to the replication data (within-replication model)

# In-sample scores: the replication-trained model applied back to the
# replication cohort itself (log-odds via type = "link").
scores_replication_model_on_replication <- predict(model_replication, newdata = categorized_replication_data,  type = "link")

# Add scores to the dataframes
categorized_replication_data$scores_replication_model_on_replication <- scores_replication_model_on_replication

# z normalize
mean_replication_on_replication <- mean(categorized_replication_data$scores_replication_model_on_replication)
sd_replication_on_replication <- sd(categorized_replication_data$scores_replication_model_on_replication)

# Notched boxplots of z-normalized in-sample scores by Category (axis
# clipped to [-2.5, 2.5]; hence the warning below).
scores_replication_model_on_replication_plot <-  ggplot(categorized_replication_data, aes(x = Category, y = (scores_replication_model_on_replication - mean_replication_on_replication)/sd_replication_on_replication, fill = Category)) +
        geom_boxplot(outlier.shape = NA, notch = TRUE) +  
    scale_fill_manual(values = group_colors) +
    ylim(-2.5, 2.5) +  
    theme_cowplot(12)

scores_replication_model_on_replication_plot
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Wilcoxon rank-sum test (the original comment said "t-test", but the code
# runs wilcox.test)
wilcox.test_replication_on_replication <- wilcox.test(scores_replication_model_on_replication ~ Category, data = categorized_replication_data)

# View the results
print(wilcox.test_replication_on_replication)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  scores_replication_model_on_replication by Category
## W = 97, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Now calculate AUC for the original data using replication-derived betas (reverse validation)

# Convert log-odds scores to probabilities via the logistic CDF.
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    probability_replication_on_discovery = plogis(scores_replication_model_on_discovery),
  )

# Convert 'Category' into a binary format: 0 for control, 1 for case
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)

# ROC curve for the replication-trained model evaluated on discovery data
# (pROC::roc; levels/direction are auto-detected, see messages below).
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability_replication_on_discovery, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

auc(roc_result_full)
## Area under the curve: 0.6564
# Calculate the confidence interval for the AUC
# DeLong 95% CI by default.
ci_auc <- ci.auc(roc_result_full)

# Print the confidence interval
print(ci_auc)
## 95% CI: 0.6221-0.6906 (DeLong)

Replication model on discovery data Odds Ratio

# Rank individuals by score and bin the ranks into quintiles:
# ceiling(rank / (n/5)) yields values 1-5, ties broken by first occurrence.
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    replication_model_on_discovery_rank = rank(scores_replication_model_on_discovery, ties.method = "first"),
    replication_model_on_discovery_score_quintiles = ceiling(replication_model_on_discovery_rank / (n() / 5)),
  )

 
# Apply function to each odds category
# calculate_odds_ratios() is defined elsewhere in this project; it returns a
# summary with odds_ratio, lower_ci, and upper_ci per quintile.
replication_model_on_discovery_combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, replication_model_on_discovery_score_quintiles)


# Base-graphics plot of log2 odds ratios per quintile with 95% CI whiskers.
plot(log2(replication_model_on_discovery_combined_odds_summary$odds_ratio), 
     ylim = c(-2,10
     ), 
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)", 
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI", 
     xaxt = "n", col = "#3C8C78")
lines(1:length(replication_model_on_discovery_combined_odds_summary$odds_ratio), log2(replication_model_on_discovery_combined_odds_summary$odds_ratio), col = "#3C8C78")

# Add error bars for 95% CI - Combined
arrows(x0 = 1:length(replication_model_on_discovery_combined_odds_summary$odds_ratio), 
       y0 = log2(replication_model_on_discovery_combined_odds_summary$lower_ci), 
       y1 = log2(replication_model_on_discovery_combined_odds_summary$upper_ci), 
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")

ANCESTRY

Determine optimal PCs where R2 is not dropping and when RMSE does not rise

# Filter the data frame to include only rows where Category == "Control"
# (the original comment said Category == 1; the code subsets controls so the
# ancestry model is fit without case status influencing it).
filtered_categorized_combined_data <- subset(categorized_combined_data, Category == "Control")

# Define the number of PCs to try
num_pcs <- 1:20

# Initialize a list to store model results
# (Actually a data.frame with one row per PC count.)
cv_results <- data.frame(PCs = num_pcs, R2 = NA, RMSE = NA)

# Perform cross-validation for each number of PCs
# For each i, fit scores ~ PC1 + ... + PCi with 10-fold CV via caret::train.
# Note: the local name `formula` shadows stats::formula inside this loop.
for (i in num_pcs) {
  formula <- as.formula(paste0("scores ~ ", paste0("PC", 1:i, collapse = " + ")))
  model <- train(formula, data = filtered_categorized_combined_data, method = "lm",
                 trControl = trainControl(method = "cv", number = 10))
  
  # Store R² and RMSE for each model
  cv_results$R2[i] <- model$results$Rsquared
  cv_results$RMSE[i] <- model$results$RMSE
}

# View the results to choose the optimal number of PCs
print(cv_results)
##    PCs         R2      RMSE
## 1    1 0.07556031 0.8552564
## 2    2 0.17565864 0.8135567
## 3    3 0.18756609 0.8140989
## 4    4 0.18713526 0.8042655
## 5    5 0.18644367 0.8081648
## 6    6 0.18811397 0.8069692
## 7    7 0.20011136 0.8089212
## 8    8 0.18121482 0.8048622
## 9    9 0.17593950 0.8129213
## 10  10 0.17775437 0.8122386
## 11  11 0.16777788 0.8134462
## 12  12 0.17388580 0.8178958
## 13  13 0.16904333 0.8160226
## 14  14 0.16621049 0.8224263
## 15  15 0.16633919 0.8210972
## 16  16 0.14802014 0.8287491
## 17  17 0.16367316 0.8192258
## 18  18 0.16472407 0.8213723
## 19  19 0.16679931 0.8194001
## 20  20 0.16339218 0.8260343
# R2 (blue) and RMSE (red) vs. number of PCs. Both series share one
# unlabelled y-axis, so only their trends are directly comparable.
ggplot(cv_results, aes(x = PCs)) +
  geom_line(aes(y = R2), color = "blue") +
  geom_line(aes(y = RMSE), color = "red") +
  ylab("Model Performance (R² / RMSE)") +
  ggtitle("Cross-Validation of PCs") +
  theme_minimal()

Regress out the influence of the ancestry PCS

# Run the regression of scores on PC1 to PC10
# Fit the ancestry model in controls only, so case status does not leak
# into the ancestry adjustment applied to everyone below.
ancestry_regression_model <- lm(scores ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
                                data = filtered_categorized_combined_data)

# Get the coefficients from the regression (kept for any downstream use)
ancestry_coefficients <- coef(ancestry_regression_model)

# Calculate the adjusted_scores by subtracting the ancestry-predicted
# component. predict() computes intercept + sum_k(PC_k * beta_k) for every
# row, replacing the original error-prone 11-term manual coefficient
# arithmetic with an identical result (as long as no coefficient is NA).
categorized_combined_data$adjusted_scores <- categorized_combined_data$scores -
  predict(ancestry_regression_model, newdata = categorized_combined_data)

Get adjusted probabilities

# Map the ancestry-adjusted log-odds scores onto the probability scale
# via the logistic CDF.
categorized_combined_data$adjusted_probability <- plogis(categorized_combined_data$adjusted_scores)

Perform PC clustering

# Extract PCA columns (PC1 to PC20)
pca_columns <- categorized_combined_data[, grep("^PC\\d+$", names(categorized_combined_data))]

# Perform GMM clustering
# Gaussian mixture model (mclust) with a fixed number of components (G = 4).
gmm_result <- Mclust(pca_columns, G = 4)  

# Extract the cluster labels from the GMM result
initial_clusters <- gmm_result$classification

# Select the data points that belong to Cluster 1
cluster1_data <- pca_columns[initial_clusters == 1, ]

# Perform GMM sub-clustering on Cluster 1
sub_gmm_result <- Mclust(cluster1_data, G = 2)  

# Integrate the sub-clustering results back into your overall clustering
final_clusters <- initial_clusters

# Assign new cluster labels to the points in Cluster 1 based on the sub-clustering
# We use +4 here to differentiate from the initial clusters (assuming there were 4 clusters initially)
# Sub-cluster labels for former Cluster 1 become 5 and 6.
final_clusters[initial_clusters == 1] <- sub_gmm_result$classification + 4


# Select the data points that belong to Cluster 2
cluster2_data <- pca_columns[initial_clusters == 2, ]

# Perform GMM sub-clustering on Cluster 2
sub_gmm_result2 <- Mclust(cluster2_data, G = 2) 

# Integrate the sub-clustering results back into your overall clustering
# Sub-cluster labels for former Cluster 2 become 7 and 8; with untouched
# clusters 3 and 4, the final label set is {3, 4, 5, 6, 7, 8}.
final_clusters[initial_clusters == 2] <- sub_gmm_result2$classification + 6  

Add cluster assignments to data

# Add the final cluster assignments to the original dataframe
categorized_combined_data$cluster_gmm <- as.factor(final_clusters)

# Visualize the clustering results
ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm)) +
  geom_point() +
  ggtitle("GMM Clustering of PCA Data") +
  theme_minimal() +
  scale_color_manual(values = rainbow(length(unique(final_clusters)))) +
  theme(legend.position = "bottom")

# Cross-tabulate GMM clusters against self-reported ancestry.
# NOTE(review): this overwrites the earlier `contingency_table` variable
# (the Category x Quadrant table) — intentional reuse, but easy to trip on.
contingency_table <- table(categorized_combined_data$cluster_gmm, categorized_combined_data$Ancestry)

contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("Cluster", "Ancestry", "Count")

# Bar plot to show the makeup of each cluster by ancestry
ggplot(contingency_df, aes(x = Cluster, y = Count, fill = Ancestry)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Makeup of Each Cluster by Ancestry") +
  xlab("Cluster") + ylab("Count") +
  theme_minimal() +
  theme(legend.position = "bottom")

Plot the PCs, ancestry clusters informed by self-reported identities, and plot adjusted data

# Define the mapping of clusters to ancestry
cluster_to_ancestry <- c("6" = "EUR", "7" = "HIS", "5" = "AFR", "3" = "OTH", "4" = "OTH", "8" = "HIS")

# Map each cluster label to its ancestry with a single vectorized
# named-vector lookup. This replaces the original per-element sapply() and
# the redundant intermediate as.character() assignment (which was
# immediately overwritten) with an identical result. Factor levels are
# fixed to the mapping's order of first appearance: EUR, HIS, AFR, OTH.
categorized_combined_data$cluster_gmm_Ancestry <- factor(
  unname(cluster_to_ancestry[as.character(categorized_combined_data$cluster_gmm)]),
  levels = unique(cluster_to_ancestry)
)

# Create a summary dataframe with counts
# Counts per (Category, ancestry cluster) pair, used for the text labels
# on the stacked bar plot below.
summary_data <- categorized_combined_data %>%
  group_by(Category, cluster_gmm_Ancestry) %>%
  summarise(count = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'Category'. You can override using the
## `.groups` argument.
# Define the palette
# One color per ancestry cluster, interpolated from wesanderson's
# "FantasticFox1" palette.
palette_colors <- wes_palette("FantasticFox1", length(unique(categorized_combined_data$cluster_gmm_Ancestry)), type = "continuous")


# Create a bar plot with percentages and counts
# NOTE(review): geom_text places labels at y = count / sum(count) (a global
# fraction) inside a position_fill() stack; the label positions may not line
# up exactly with each Category's segment midpoints — verify visually.
ancestry_makeup <- ggplot(categorized_combined_data, aes(x = Category, fill = cluster_gmm_Ancestry)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Makeup of cluster_gmm_Ancestry within Each Category") +
  xlab("Category") +
  ylab("Percentage") +
  theme(legend.position = "bottom") +
  scale_fill_manual(values = palette_colors) +
  geom_text(data = summary_data, aes(label = count, y = count / sum(count), group = cluster_gmm_Ancestry),
            position = position_fill(vjust = 0.5), color = "black") + 
  theme_cowplot(12)


# PC1 vs. PC2 scatter, colored by the ancestry-labelled GMM clusters.
# theme_cowplot(12) is added last, so it supersedes the earlier
# theme_minimal()/legend settings (complete themes replace prior themes).
clusters <- ggplot(categorized_combined_data,
                   aes(x = PC1, y = PC2, color = cluster_gmm_Ancestry)) +
  geom_point() +
  labs(title = "GMM Clustering of PCA Data") +
  theme_minimal() +
  scale_color_manual(values = palette_colors) +
  theme(legend.position = "bottom") +
  theme_cowplot(12)


# Helper: notched, fill-by-Category boxplot of column `yvar`, y-axis clipped
# to [-3, 3]. `xvar` defaults to the ancestry cluster; pass "Category" for an
# overall (non-ancestry-stratified) plot. Layer/theme order matches the
# original seven copy-pasted plot definitions exactly.
make_category_boxplot <- function(data, yvar, ylab_text, title_text,
                                  xvar = "cluster_gmm_Ancestry") {
  ggplot(data, aes(x = .data[[xvar]], y = .data[[yvar]], fill = Category)) +
    geom_boxplot(outlier.shape = NA, notch = TRUE) +
    labs(title = title_text, x = xvar, y = ylab_text) +
    scale_fill_manual(values = group_colors) +
    theme(legend.position = "bottom") +
    theme_cowplot(12) +
    ylim(-3, 3)
}

# Generalized Plot for Scores
ancestry_scores <- make_category_boxplot(
  categorized_combined_data, "scores", "Scores",
  "Scores by cluster_gmm_Ancestry and Category")

# Generalized Plot for Adjusted Scores
ancestry_adjusted_scores <- make_category_boxplot(
  categorized_combined_data, "adjusted_scores", "Adjusted Scores",
  "Adjusted Scores by cluster_gmm_Ancestry and Category")

# Adjusted Scores by Category (not ancestry-specific)
ancestry_adjusted_total_scores <- make_category_boxplot(
  categorized_combined_data, "adjusted_scores", "Adjusted Scores",
  "Adjusted Scores by Category", xvar = "Category")

# Specific Plot for Cardiac Variant Scores
cardiac_variant_scores <- make_category_boxplot(
  categorized_combined_data, "cardiac_variant_score", "Cardiac Variant Scores",
  "Cardiac Variant Scores by cluster_gmm_Ancestry and Category")

# Specific Plot for Epilepsy Variant Scores
epilepsy_variant_scores <- make_category_boxplot(
  categorized_combined_data, "epilepsy_variant_score", "Epilepsy Variant Scores",
  "Epilepsy Variant Scores by cluster_gmm_Ancestry and Category")

# Specific Plot for Common Variant Scores
common_variant_scores <- make_category_boxplot(
  categorized_combined_data, "common_variant_score", "Common Variant Scores",
  "Common Variant Scores by cluster_gmm_Ancestry and Category")

# Specific Plot for Noncoding Variant Scores
noncoding_variant_scores <- make_category_boxplot(
  categorized_combined_data, "noncoding_variant_score", "Noncoding Variant Scores",
  "Noncoding Variant Scores by cluster_gmm_Ancestry and Category")

# Print Plots
clusters

ancestry_makeup

# suppressWarnings() hides the ylim-clipping warnings from the boxplots.
suppressWarnings(print(ancestry_scores))

suppressWarnings(print(ancestry_adjusted_scores))

suppressWarnings(print(ancestry_adjusted_total_scores))

suppressWarnings(print(cardiac_variant_scores))

suppressWarnings(print(epilepsy_variant_scores))
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

suppressWarnings(print(common_variant_scores))

suppressWarnings(print(noncoding_variant_scores))
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

# Wilcoxon Test for Adjusted Scores
# Overall case/control separation after the ancestry adjustment.
wilcox.test(adjusted_scores ~ Category, data = categorized_combined_data)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  adjusted_scores by Category
## W = 79303, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Wilcoxon Test within each ancestry group (Adjusted Scores)
adjusted_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(
    p_value = wilcox.test(adjusted_scores ~ Category)$p.value
  )
print(adjusted_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry       p_value
##   <fct>                        <dbl>
## 1 EUR                  0.00000000433
## 2 HIS                  0.0181       
## 3 AFR                  0.0000162    
## 4 OTH                  0.00548
# Wilcoxon Test within each ancestry group (Scores)
# Same per-ancestry test on the unadjusted scores, for comparison.
scores_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(
    p_value = wilcox.test(scores ~ Category)$p.value
  )
print(scores_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry  p_value
##   <fct>                   <dbl>
## 1 EUR                  1.75e-12
## 2 HIS                  2.43e- 3
## 3 AFR                  1.17e- 7
## 4 OTH                  1.32e- 3

Interaction model to evaluate ancestry x PGS interactions

# Interaction model: Does ancestry modify the effect of the variables on case/control?
# Each Normalized_* PGS is crossed with the ancestry cluster; significant
# PGS:ancestry interaction coefficients would indicate ancestry-specific
# PGS effects on case/control status.
interaction_model <- glm(Category ~ Normalized_LVEF * cluster_gmm_Ancestry + 
                         Normalized_Heart_rate * cluster_gmm_Ancestry + 
                         Normalized_SV * cluster_gmm_Ancestry + 
                         Normalized_LVEDVI * cluster_gmm_Ancestry + 
                         Normalized_Afib * cluster_gmm_Ancestry + 
                         Normalized_PR_interval * cluster_gmm_Ancestry + 
                         Normalized_LVESV * cluster_gmm_Ancestry + 
                         Normalized_SVI * cluster_gmm_Ancestry + 
                         Normalized_BP * cluster_gmm_Ancestry + 
                         Normalized_QTc * cluster_gmm_Ancestry, 
                         data = categorized_combined_data, family = binomial)

summary(interaction_model)
## 
## Call:
## glm(formula = Category ~ Normalized_LVEF * cluster_gmm_Ancestry + 
##     Normalized_Heart_rate * cluster_gmm_Ancestry + Normalized_SV * 
##     cluster_gmm_Ancestry + Normalized_LVEDVI * cluster_gmm_Ancestry + 
##     Normalized_Afib * cluster_gmm_Ancestry + Normalized_PR_interval * 
##     cluster_gmm_Ancestry + Normalized_LVESV * cluster_gmm_Ancestry + 
##     Normalized_SVI * cluster_gmm_Ancestry + Normalized_BP * cluster_gmm_Ancestry + 
##     Normalized_QTc * cluster_gmm_Ancestry, family = binomial, 
##     data = categorized_combined_data)
## 
## Coefficients:
##                                                 Estimate Std. Error z value
## (Intercept)                                     0.985996   0.189985   5.190
## Normalized_LVEF                                -0.056041   0.160970  -0.348
## cluster_gmm_AncestryHIS                        -2.618199   0.270454  -9.681
## cluster_gmm_AncestryAFR                        -1.501416   0.291432  -5.152
## cluster_gmm_AncestryOTH                         0.085666   0.347552   0.246
## Normalized_Heart_rate                          -0.148686   0.127560  -1.166
## Normalized_SV                                  -0.129167   0.144625  -0.893
## Normalized_LVEDVI                               0.099653   0.166314   0.599
## Normalized_Afib                                -0.119832   0.128547  -0.932
## Normalized_PR_interval                          0.233999   0.117628   1.989
## Normalized_LVESV                                0.006579   0.184338   0.036
## Normalized_SVI                                 -0.007414   0.176301  -0.042
## Normalized_BP                                   0.100302   0.128746   0.779
## Normalized_QTc                                  0.144560   0.129390   1.117
## Normalized_LVEF:cluster_gmm_AncestryHIS         0.161473   0.251740   0.641
## Normalized_LVEF:cluster_gmm_AncestryAFR        -0.057026   0.243657  -0.234
## Normalized_LVEF:cluster_gmm_AncestryOTH         0.149925   0.379303   0.395
## cluster_gmm_AncestryHIS:Normalized_Heart_rate   0.279794   0.205353   1.363
## cluster_gmm_AncestryAFR:Normalized_Heart_rate   0.248188   0.199291   1.245
## cluster_gmm_AncestryOTH:Normalized_Heart_rate   0.021320   0.282468   0.075
## cluster_gmm_AncestryHIS:Normalized_SV           0.018127   0.241476   0.075
## cluster_gmm_AncestryAFR:Normalized_SV          -0.005352   0.227930  -0.023
## cluster_gmm_AncestryOTH:Normalized_SV          -0.099748   0.367285  -0.272
## cluster_gmm_AncestryHIS:Normalized_LVEDVI      -0.059796   0.277594  -0.215
## cluster_gmm_AncestryAFR:Normalized_LVEDVI      -0.012314   0.242344  -0.051
## cluster_gmm_AncestryOTH:Normalized_LVEDVI       0.162975   0.409278   0.398
## cluster_gmm_AncestryHIS:Normalized_Afib        -0.063225   0.207940  -0.304
## cluster_gmm_AncestryAFR:Normalized_Afib         0.092741   0.189536   0.489
## cluster_gmm_AncestryOTH:Normalized_Afib         0.318674   0.279610   1.140
## cluster_gmm_AncestryHIS:Normalized_PR_interval -0.107243   0.195593  -0.548
## cluster_gmm_AncestryAFR:Normalized_PR_interval -0.006562   0.185089  -0.035
## cluster_gmm_AncestryOTH:Normalized_PR_interval  0.140872   0.354449   0.397
## cluster_gmm_AncestryHIS:Normalized_LVESV        0.611566   0.302932   2.019
## cluster_gmm_AncestryAFR:Normalized_LVESV        0.002846   0.265806   0.011
## cluster_gmm_AncestryOTH:Normalized_LVESV        0.084992   0.343738   0.247
## cluster_gmm_AncestryHIS:Normalized_SVI         -0.031809   0.270997  -0.117
## cluster_gmm_AncestryAFR:Normalized_SVI          0.088692   0.237651   0.373
## cluster_gmm_AncestryOTH:Normalized_SVI          0.064705   0.353658   0.183
## cluster_gmm_AncestryHIS:Normalized_BP           0.081335   0.195361   0.416
## cluster_gmm_AncestryAFR:Normalized_BP           0.286601   0.198513   1.444
## cluster_gmm_AncestryOTH:Normalized_BP          -0.185752   0.315792  -0.588
## cluster_gmm_AncestryHIS:Normalized_QTc         -0.256980   0.198895  -1.292
## cluster_gmm_AncestryAFR:Normalized_QTc         -0.261544   0.239132  -1.094
## cluster_gmm_AncestryOTH:Normalized_QTc         -0.395822   0.295921  -1.338
##                                                Pr(>|z|)    
## (Intercept)                                    2.10e-07 ***
## Normalized_LVEF                                  0.7277    
## cluster_gmm_AncestryHIS                         < 2e-16 ***
## cluster_gmm_AncestryAFR                        2.58e-07 ***
## cluster_gmm_AncestryOTH                          0.8053    
## Normalized_Heart_rate                            0.2438    
## Normalized_SV                                    0.3718    
## Normalized_LVEDVI                                0.5490    
## Normalized_Afib                                  0.3512    
## Normalized_PR_interval                           0.0467 *  
## Normalized_LVESV                                 0.9715    
## Normalized_SVI                                   0.9665    
## Normalized_BP                                    0.4359    
## Normalized_QTc                                   0.2639    
## Normalized_LVEF:cluster_gmm_AncestryHIS          0.5212    
## Normalized_LVEF:cluster_gmm_AncestryAFR          0.8150    
## Normalized_LVEF:cluster_gmm_AncestryOTH          0.6926    
## cluster_gmm_AncestryHIS:Normalized_Heart_rate    0.1730    
## cluster_gmm_AncestryAFR:Normalized_Heart_rate    0.2130    
## cluster_gmm_AncestryOTH:Normalized_Heart_rate    0.9398    
## cluster_gmm_AncestryHIS:Normalized_SV            0.9402    
## cluster_gmm_AncestryAFR:Normalized_SV            0.9813    
## cluster_gmm_AncestryOTH:Normalized_SV            0.7859    
## cluster_gmm_AncestryHIS:Normalized_LVEDVI        0.8295    
## cluster_gmm_AncestryAFR:Normalized_LVEDVI        0.9595    
## cluster_gmm_AncestryOTH:Normalized_LVEDVI        0.6905    
## cluster_gmm_AncestryHIS:Normalized_Afib          0.7611    
## cluster_gmm_AncestryAFR:Normalized_Afib          0.6246    
## cluster_gmm_AncestryOTH:Normalized_Afib          0.2544    
## cluster_gmm_AncestryHIS:Normalized_PR_interval   0.5835    
## cluster_gmm_AncestryAFR:Normalized_PR_interval   0.9717    
## cluster_gmm_AncestryOTH:Normalized_PR_interval   0.6910    
## cluster_gmm_AncestryHIS:Normalized_LVESV         0.0435 *  
## cluster_gmm_AncestryAFR:Normalized_LVESV         0.9915    
## cluster_gmm_AncestryOTH:Normalized_LVESV         0.8047    
## cluster_gmm_AncestryHIS:Normalized_SVI           0.9066    
## cluster_gmm_AncestryAFR:Normalized_SVI           0.7090    
## cluster_gmm_AncestryOTH:Normalized_SVI           0.8548    
## cluster_gmm_AncestryHIS:Normalized_BP            0.6772    
## cluster_gmm_AncestryAFR:Normalized_BP            0.1488    
## cluster_gmm_AncestryOTH:Normalized_BP            0.5564    
## cluster_gmm_AncestryHIS:Normalized_QTc           0.1963    
## cluster_gmm_AncestryAFR:Normalized_QTc           0.2741    
## cluster_gmm_AncestryOTH:Normalized_QTc           0.1810    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1370.0  on 992  degrees of freedom
## Residual deviance: 1091.5  on 949  degrees of freedom
## AIC: 1179.5
## 
## Number of Fisher Scoring iterations: 4

Now redo the ancestry corrections by adjusting the variable prior to fully incorporating into GAPS

# List of Scores to Correct
score_list <- c("Normalized_QTc", "Normalized_BP", "Normalized_SVI", "Normalized_LVESV", 
                "Normalized_PR_interval", "Normalized_Afib", "Normalized_LVEDVI", "Normalized_SV", 
                "Normalized_Heart_rate", "Normalized_LVEF", "Epilepsy_rare", "Epilepsy_ultrarare", 
                "Cardiomyopathy_rare", "Cardiomyopathy_ultrarare", "PLP_Epilepsy", "PLP_Cardiomyopathy",
                "Cardiomyopathy_noncoding_rare", "Epilepsy_noncoding_rare", "Cardiomyopathy_null", "Epilepsy_null")

# Names of the ten genetic principal components used as ancestry covariates
pc_terms <- paste0("PC", 1:10)

# Adjust Each Score in the Training Data:
# regress each score on PC1-PC10 in the training set, then subtract the
# fitted ancestry component (intercept + sum of PC effects) so the
# "_ANCESTRY_adjusted" columns hold ancestry-residualized scores.
for (score in score_list) {
  
  # Run the regression of the score on PC1 to PC10
  ancestry_regression_model <- lm(
    reformulate(pc_terms, response = score),
    data = filtered_categorized_combined_data
  )
  
  # Subtract the ancestry-predicted component. predict() reproduces
  # intercept + PC1*b1 + ... + PC10*b10 exactly and, unlike the manual
  # coefficient arithmetic it replaces, does not propagate NA for every
  # row when a coefficient is dropped from a rank-deficient fit.
  categorized_combined_data[[paste0(score, "_ANCESTRY_adjusted")]] <-
    categorized_combined_data[[score]] -
    predict(ancestry_regression_model, newdata = categorized_combined_data)
}

# Fit Logistic Regression Using Adjusted Scores
# Build the model formula from the ancestry-adjusted score column names.
adjusted_score_list <- paste0(score_list, "_ANCESTRY_adjusted")
adjusted_scores_formula <- reformulate(adjusted_score_list, response = "Category")
model_ANCESTRY_adjusted <- glm(adjusted_scores_formula,
                               data = categorized_combined_data,
                               family = binomial())

# Predict Scores for Training Data on the linear-predictor (log-odds) scale
categorized_combined_data$Scores_ANCESTRY_adjusted <-
  predict(model_ANCESTRY_adjusted, type = "link")

# Plot Adjusted Scores
# Boxplots of the adjusted scores split by case/control category.
# Outliers are hidden; the notch gives a rough median comparison.
ancestry_ADJUSTED_pre_by_category <- ggplot(
  categorized_combined_data,
  aes(x = Category, y = Scores_ANCESTRY_adjusted, fill = Category)
) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(
    title = "Adjusted Scores by Category",
    x = "Category",
    y = "Adjusted Scores"
  ) +
  scale_fill_manual(values = group_colors) +
  theme(legend.position = "bottom") +
  theme_cowplot(12) +
  ylim(-3, 3)

ancestry_ADJUSTED_pre_by_category
## Warning: Removed 35 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Adjusted scores by ancestry cluster, split by Category. Patterns
# (ggpattern) distinguish categories in addition to fill color, which
# helps in grayscale reproduction.
ancestry_ADJUSTED_pre_by_ancestry <- ggplot(
  categorized_combined_data,
  aes(x = cluster_gmm_Ancestry,
      y = Scores_ANCESTRY_adjusted,
      fill = Category,
      pattern = Category)
) +
  geom_boxplot_pattern(
    outlier.shape = NA,
    notch = TRUE,
    pattern_density = 0.1,
    pattern_fill = "black",
    pattern_spacing = 0.01,
    pattern_angle = 90
  ) +
  labs(
    title = "Adjusted Scores by Ancestry",
    x = "Ancestry Cluster",
    y = "Adjusted Scores"
  ) +
  scale_fill_manual(values = group_colors) +
  scale_pattern_manual(values = c("stripe", "crosshatch", "dot", "none")) +
  theme(legend.position = "bottom") +
  theme_cowplot(12) +
  ylim(-3, 3)

ancestry_ADJUSTED_pre_by_ancestry
## Warning: Removed 35 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Compute ROC and AUC for Training Data.
# plogis() maps the linear predictor (log-odds) to a probability.
categorized_combined_data <- categorized_combined_data %>%
  mutate(adjusted_probability = plogis(Scores_ANCESTRY_adjusted))

roc_result_full_PRE_ADJUSTED <- roc(
  categorized_combined_data$binary_outcome,
  categorized_combined_data$adjusted_probability,
  plot = TRUE
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

print(paste("Training AUC:", auc(roc_result_full_PRE_ADJUSTED)))
## [1] "Training AUC: 0.758878107746088"
# Adjust Each Score in the Replication Data:
# the ancestry regression is refit in the TRAINING set and its fitted
# component is then subtracted from the replication data, so the
# correction is never learned from the replication samples themselves.
for (score in score_list) {
  
  # Run ancestry adjustment regression in the training set
  ancestry_regression_model <- lm(
    reformulate(paste0("PC", 1:10), response = score),
    data = filtered_categorized_combined_data
  )
  
  # Apply ancestry adjustment to the replication dataset. predict()
  # reproduces intercept + sum(PC_i * beta_i) on the replication PCs
  # and, unlike manual coefficient arithmetic, does not turn every row
  # into NA when a coefficient is dropped from a rank-deficient fit.
  categorized_replication_data[[paste0(score, "_ANCESTRY_adjusted")]] <-
    categorized_replication_data[[score]] -
    predict(ancestry_regression_model, newdata = categorized_replication_data)
}

# Predict Using the Ancestry-Adjusted Replication Data
adjusted_score_list_replication <- paste0(score_list, "_ANCESTRY_adjusted")

# Restrict newdata to the adjusted score columns so only the
# ancestry-corrected variables feed the trained model.
replication_newdata <-
  categorized_replication_data[, adjusted_score_list_replication, drop = FALSE]
categorized_replication_data$Scores_ANCESTRY_adjusted_replication <- predict(
  model_ANCESTRY_adjusted,
  newdata = replication_newdata,
  type = "link"
)

# Convert to Probabilities (inverse logit of the linear predictor)
categorized_replication_data <- categorized_replication_data %>%
  mutate(adjusted_probability_replication = plogis(Scores_ANCESTRY_adjusted_replication))

# Compute ROC and AUC for Replication Data
roc_result_full_adjusted <- roc(
  categorized_replication_data$binary_outcome,
  categorized_replication_data$adjusted_probability_replication
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste("Replication AUC:", auc(roc_result_full_adjusted)))
# xlim reversed so specificity runs 1 -> 0 left to right
plot(roc_result_full_adjusted, xlim = c(1, 0))

# Wilcoxon Test within each ancestry group (Scores):
# one case-vs-control rank-sum p-value per ancestry cluster.
scores_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(p_value = wilcox.test(Scores_ANCESTRY_adjusted ~ Category)$p.value)
print(scores_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry  p_value
##   <fct>                   <dbl>
## 1 EUR                  6.46e-16
## 2 HIS                  5.57e- 6
## 3 AFR                  8.54e- 9
## 4 OTH                  1.07e- 4

Now plot the data using the ancestry corrections by adjusting the variables prior to fully incorporating them into GAPS

# List of Common Variant Scores to Correct
common_variant_scores <- c("Normalized_QTc", "Normalized_BP", "Normalized_SVI", "Normalized_LVESV", 
                           "Normalized_PR_interval", "Normalized_Afib", "Normalized_LVEDVI", "Normalized_SV", 
                           "Normalized_Heart_rate", "Normalized_LVEF")

# Adjust Each Score in the Training Data:
# regress each common-variant score on PC1-PC10 in the training set and
# subtract the fitted ancestry component to residualize on ancestry.
for (score in common_variant_scores) {
  
  # Run the regression of the score on PC1 to PC10
  ancestry_regression_model <- lm(
    reformulate(paste0("PC", 1:10), response = score),
    data = filtered_categorized_combined_data
  )
  
  # Subtract the ancestry-predicted component. predict() computes
  # intercept + sum(PC_i * beta_i) and is robust to NA coefficients
  # from rank-deficient fits, unlike the manual per-PC arithmetic.
  categorized_combined_data[[paste0(score, "_ANCESTRY_adjusted")]] <-
    categorized_combined_data[[score]] -
    predict(ancestry_regression_model, newdata = categorized_combined_data)
}

# Fit Logistic Regression Using Adjusted Scores
adjusted_common_variant_scores <- paste0(common_variant_scores, "_ANCESTRY_adjusted")
common_variant_formula <- reformulate(adjusted_common_variant_scores, response = "Category")
model_common_variant_ADJUSTED_pre <- glm(common_variant_formula,
                                         data = categorized_combined_data,
                                         family = binomial())

# Predict Common Variant Score on the linear-predictor (log-odds) scale
categorized_combined_data$Common_Variant_Score_ADJUSTED_pre <-
  predict(model_common_variant_ADJUSTED_pre, type = "link")

# Plot Common Variant Score by Category (boxplots, outliers hidden)
common_variant_plot_by_category_ADJUSTED_pre <- ggplot(
  categorized_combined_data,
  aes(x = Category, y = Common_Variant_Score_ADJUSTED_pre, fill = Category)
) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(
    title = "Common Variant Score by Category",
    x = "Category",
    y = "Common Variant Score"
  ) +
  scale_fill_manual(values = group_colors) +
  theme(legend.position = "bottom") +
  theme_cowplot(12) +
  ylim(-3, 3)

common_variant_plot_by_category_ADJUSTED_pre
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

# Plot Common Variant Score by Ancestry cluster, split by Category
common_variant_plot_by_ancestry_ADJUSTED_pre <- ggplot(
  categorized_combined_data,
  aes(x = cluster_gmm_Ancestry, y = Common_Variant_Score_ADJUSTED_pre, fill = Category)
) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(
    title = "Common Variant Score by Ancestry",
    x = "Ancestry Cluster",
    y = "Common Variant Score"
  ) +
  scale_fill_manual(values = group_colors) +
  theme(legend.position = "bottom") +
  theme_cowplot(12) +
  ylim(-3, 3)

common_variant_plot_by_ancestry_ADJUSTED_pre
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

# Wilcoxon Test across the whole cohort (Common Variant Score).
# NOTE(review): there is no group_by() here, so this is an overall
# case-vs-control test, not a per-ancestry one as previously labeled.
scores_wilcox_results <- categorized_combined_data %>%
  summarise(p_value = wilcox.test(Common_Variant_Score_ADJUSTED_pre ~ Category)$p.value)
print(scores_wilcox_results)
##        p_value
## 1 5.584507e-20
# Wilcoxon Test within each ancestry group (Common Variant Score):
# one case-vs-control rank-sum p-value per ancestry cluster.
scores_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(p_value = wilcox.test(Common_Variant_Score_ADJUSTED_pre ~ Category)$p.value)
print(scores_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry     p_value
##   <fct>                      <dbl>
## 1 EUR                  0.000000680
## 2 HIS                  0.0000165  
## 3 AFR                  0.000238   
## 4 OTH                  0.0857

Compute the odds ratios in the highest quintile across ancestry before and after the post-integration adjustments

# Rank the (post-integration) adjusted scores and bin into quintiles:
# ties.method = "first" guarantees a unique rank per row, and
# ceiling(rank / (n/5)) assigns quintile labels 1..5.
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    rank_adjusted = rank(adjusted_scores, ties.method = "first"),
    score_adjusted_quintiles = ceiling(rank_adjusted / (n() / 5))
  )

# Subset by ancestry cluster
AFR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "AFR", ]
HIS <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "HIS", ]
EUR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "EUR", ]
OTH <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "OTH", ]

# Apply the OR function defined earlier (unadjusted score quintiles)
combined_odds_summary <- calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
combined_odds_summary_AFR <- calculate_odds_ratios(AFR, Category, score_quintiles)
combined_odds_summary_HIS <- calculate_odds_ratios(HIS, Category, score_quintiles)
combined_odds_summary_EUR <- calculate_odds_ratios(EUR, Category, score_quintiles)
combined_odds_summary_OTH <- calculate_odds_ratios(OTH, Category, score_quintiles)

combined_odds_summary
## # A tibble: 5 × 8
##   score_quintiles count_category_1 count_category_case  odds odds_ratio
##             <dbl>            <int>               <int> <dbl>      <dbl>
## 1               1              162                  36 0.222       1   
## 2               2              138                  61 0.442       1.99
## 3               3              116                  82 0.707       3.18
## 4               4               89                 110 1.24        5.56
## 5               5               32                 167 5.22       23.5 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR 
## # A tibble: 5 × 8
##   score_quintiles count_category_1 count_category_case  odds odds_ratio
##             <dbl>            <int>               <int> <dbl>      <dbl>
## 1               1               82                  21 0.256       1   
## 2               2               56                  18 0.321       1.26
## 3               3               38                  21 0.553       2.16
## 4               4               13                  13 1           3.90
## 5               5                4                  17 4.25       16.6 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS 
## # A tibble: 5 × 8
##   score_quintiles count_category_1 count_category_case  odds odds_ratio
##             <dbl>            <int>               <int> <dbl>      <dbl>
## 1               1               69                  10 0.145       1   
## 2               2               56                   9 0.161       1.11
## 3               3               41                  13 0.317       2.19
## 4               4               49                  17 0.347       2.39
## 5               5               15                   9 0.6         4.14
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR 
## # A tibble: 5 × 8
##   score_quintiles count_category_1 count_category_case  odds odds_ratio
##             <dbl>            <int>               <int> <dbl>      <dbl>
## 1               1                8                   2  0.25       1   
## 2               2               19                  24  1.26       5.05
## 3               3               29                  39  1.34       5.38
## 4               4               23                  69  3         12   
## 5               5               11                 112 10.2       40.7 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH 
## # A tibble: 5 × 8
##   score_quintiles count_category_1 count_category_case  odds odds_ratio
##             <dbl>            <int>               <int> <dbl>      <dbl>
## 1               1                3                   3  1          1   
## 2               2                7                  10  1.43       1.43
## 3               3                8                   9  1.12       1.12
## 4               4                4                  11  2.75       2.75
## 5               5                2                  29 14.5       14.5 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Apply the OR function defined earlier (ancestry-adjusted quintiles)

combined_odds_summary_adjust <- calculate_odds_ratios(categorized_combined_data, Category, score_adjusted_quintiles)
combined_odds_summary_AFR_adjust <- calculate_odds_ratios(AFR, Category, score_adjusted_quintiles)
combined_odds_summary_HIS_adjust <- calculate_odds_ratios(HIS, Category, score_adjusted_quintiles)
combined_odds_summary_EUR_adjust <- calculate_odds_ratios(EUR, Category, score_adjusted_quintiles)
combined_odds_summary_OTH_adjust <- calculate_odds_ratios(OTH, Category, score_adjusted_quintiles)

combined_odds_summary_adjust
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_case  odds odds_ratio
##                      <dbl>            <int>               <int> <dbl>      <dbl>
## 1                        1              134                  64 0.478       1   
## 2                        2              131                  68 0.519       1.09
## 3                        3              118                  80 0.678       1.42
## 4                        4              105                  94 0.895       1.87
## 5                        5               49                 150 3.06        6.41
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR_adjust 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_case  odds odds_ratio
##                      <dbl>            <int>               <int> <dbl>      <dbl>
## 1                        1               52                  16 0.308      1    
## 2                        2               40                  11 0.275      0.894
## 3                        3               45                  16 0.356      1.16 
## 4                        4               45                  25 0.556      1.81 
## 5                        5               11                  22 2          6.5  
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS_adjust 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_case  odds odds_ratio
##                      <dbl>            <int>               <int> <dbl>      <dbl>
## 1                        1               56                   8 0.143       1   
## 2                        2               57                  11 0.193       1.35
## 3                        3               44                  13 0.295       2.07
## 4                        4               43                  17 0.395       2.77
## 5                        5               30                   9 0.3         2.1 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR_adjust 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_case  odds odds_ratio
##                      <dbl>            <int>               <int> <dbl>      <dbl>
## 1                        1               23                  30  1.30       1   
## 2                        2               27                  41  1.52       1.16
## 3                        3               20                  44  2.2        1.69
## 4                        4               14                  47  3.36       2.57
## 5                        5                6                  84 14         10.7 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH_adjust 
## # A tibble: 5 × 8
##   score_adjusted_quinti…¹ count_category_1 count_category_case   odds odds_ratio
##                     <dbl>            <int>               <int>  <dbl>      <dbl>
## 1                       1                3                  10  3.33       1    
## 2                       2                7                   5  0.714      0.214
## 3                       3                9                   7  0.778      0.233
## 4                       4                3                   5  1.67       0.5  
## 5                       5                2                  35 17.5        5.25 
## # ℹ abbreviated name: ¹​score_adjusted_quintiles
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>

Extract the odds ratios by ancestry for plotting

# Extracting quintile 5 (top 20%) data for each odds-ratio summary.
# Collect the summaries in a named list so the extraction is written once.
or_summaries <- list(
  Total = combined_odds_summary,
  AFR   = combined_odds_summary_AFR,
  HIS   = combined_odds_summary_HIS,
  EUR   = combined_odds_summary_EUR,
  OTH   = combined_odds_summary_OTH
)

# pull_q5: extract row 5 (the top quintile) of one column from every summary
pull_q5 <- function(col) {
  vapply(or_summaries, function(s) s[[col]][5], numeric(1), USE.NAMES = FALSE)
}

quintile_5_data <- data.frame(
  group = names(or_summaries),
  odds_ratio = pull_q5("odds_ratio"),
  lower_ci = pull_q5("lower_ci"),
  upper_ci = pull_q5("upper_ci")
)

# Log2 transformation for a symmetric odds-ratio axis
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)

# Fixed y-limits so this panel is comparable with the adjusted versions
ylim_range <- range(-2, 6.5)

# Plotting. Point color "#3C8C78" matches group_colors and the error
# bars below; the previous "#3C8474" appears to have been a typo.
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19,
     xlab = "Group", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups",
     xaxt = "n", col = "#3C8C78")
axis(1, at = seq_len(nrow(quintile_5_data)), labels = quintile_5_data$group)

# Add error bars for 95% CI
arrows(x0 = seq_len(nrow(quintile_5_data)),
       y0 = quintile_5_data$log2_lower_ci,
       y1 = quintile_5_data$log2_upper_ci,
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")

Now let's compute the odds ratios for the 5th quintile by ancestry, corrected post-integration of the variables

# Extract quintile 5 (top 20%) data from each ancestry-adjusted summary.
# Gather the summaries in a named list so the row-5 extraction is written once.
adjusted_summaries <- list(
  Total = combined_odds_summary_adjust,
  AFR   = combined_odds_summary_AFR_adjust,
  HIS   = combined_odds_summary_HIS_adjust,
  EUR   = combined_odds_summary_EUR_adjust,
  OTH   = combined_odds_summary_OTH_adjust
)

# top_quintile_col: row 5 of one column from every summary, as a plain vector
top_quintile_col <- function(col) {
  vapply(adjusted_summaries, function(s) s[[col]][5], numeric(1), USE.NAMES = FALSE)
}

quintile_5_data <- data.frame(
  group = names(adjusted_summaries),
  odds_ratio = top_quintile_col("odds_ratio"),
  lower_ci = top_quintile_col("lower_ci"),
  upper_ci = top_quintile_col("upper_ci")
)

# Log2 transformation for a symmetric odds-ratio axis
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)

# Fixed y-limits, matching the unadjusted panel for comparability
ylim_range <- range(-2, 6.5)

# Plot
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19,
     xlab = "Group", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups, adjusted",
     xaxt = "n", col = "black")
axis(1, at = seq_len(nrow(quintile_5_data)), labels = quintile_5_data$group)

# Add error bars for 95% CI
arrows(x0 = seq_len(nrow(quintile_5_data)),
       y0 = quintile_5_data$log2_lower_ci,
       y1 = quintile_5_data$log2_upper_ci,
       code = 3, angle = 90, length = 0.05, col = "black")

Now let's compute the odds ratios for the 5th quintile by ancestry, corrected pre-integration of the variables

categorized_combined_data <- categorized_combined_data %>%
  mutate(
    rank_adjusted = rank(Scores_ANCESTRY_adjusted, ties.method = "first"),
    score_adjusted_quintiles = ceiling(rank_adjusted / (n() / 5))
  )

# Subset the cohort by GMM-inferred ancestry cluster
AFR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "AFR", ]
HIS <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "HIS", ]
EUR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "EUR", ]
OTH <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "OTH", ]

# Apply the odds-ratio function defined earlier (per-quintile ORs relative
# to quintile 1). Idiomatic `<-` assignment replaces top-level `=`.
combined_odds_summary <- calculate_odds_ratios(categorized_combined_data, Category, score_adjusted_quintiles)
combined_odds_summary_AFR <- calculate_odds_ratios(AFR, Category, score_adjusted_quintiles)
combined_odds_summary_HIS <- calculate_odds_ratios(HIS, Category, score_adjusted_quintiles)
combined_odds_summary_EUR <- calculate_odds_ratios(EUR, Category, score_adjusted_quintiles)
combined_odds_summary_OTH <- calculate_odds_ratios(OTH, Category, score_adjusted_quintiles)

# Inspect the per-quintile odds-ratio tables (overall cohort first, then by
# ancestry cluster); quintile 1 is the reference group with OR = 1.
combined_odds_summary
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_case  odds odds_ratio
##                      <dbl>            <int>               <int> <dbl>      <dbl>
## 1                        1              156                  42 0.269       1   
## 2                        2              141                  58 0.411       1.53
## 3                        3              123                  75 0.610       2.26
## 4                        4               88                 111 1.26        4.69
## 5                        5               29                 170 5.86       21.8 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_case  odds odds_ratio
##                      <dbl>            <int>               <int> <dbl>      <dbl>
## 1                        1               56                  11 0.196       1   
## 2                        2               48                  11 0.229       1.17
## 3                        3               42                  19 0.452       2.30
## 4                        4               36                  23 0.639       3.25
## 5                        5               11                  26 2.36       12.0 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS 
## # A tibble: 5 × 8
##   score_adjusted_quinti…¹ count_category_1 count_category_case   odds odds_ratio
##                     <dbl>            <int>               <int>  <dbl>      <dbl>
## 1                       1               64                   6 0.0938       1   
## 2                       2               66                  11 0.167        1.78
## 3                       3               54                  13 0.241        2.57
## 4                       4               38                  21 0.553        5.89
## 5                       5                8                   7 0.875        9.33
## # ℹ abbreviated name: ¹​score_adjusted_quintiles
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR 
## # A tibble: 5 × 8
##   score_adjusted_quinti…¹ count_category_1 count_category_case   odds odds_ratio
##                     <dbl>            <int>               <int>  <dbl>      <dbl>
## 1                       1               29                  15  0.517       1   
## 2                       2               22                  33  1.5         2.9 
## 3                       3               19                  39  2.05        3.97
## 4                       4               13                  59  4.54        8.77
## 5                       5                7                 100 14.3        27.6 
## # ℹ abbreviated name: ¹​score_adjusted_quintiles
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_case  odds odds_ratio
##                      <dbl>            <int>               <int> <dbl>      <dbl>
## 1                        1                7                  10  1.43       1   
## 2                        2                5                   3  0.6        0.42
## 3                        3                8                   4  0.5        0.35
## 4                        4                1                   8  8          5.6 
## 5                        5                3                  37 12.3        8.63
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Extract quintile 5 (top 20%) results from each summary computed just above.
# BUG FIX: the original referenced the *_adjust objects from the earlier
# post-integration section (copy-paste), which would silently re-plot that
# section's results instead of the pre-integration summaries printed above.
quintile_5_data <- data.frame(
  group = c("Total","AFR", "HIS", "EUR", "OTH"),
  odds_ratio = c(combined_odds_summary$odds_ratio[5],
                 combined_odds_summary_AFR$odds_ratio[5],
                 combined_odds_summary_HIS$odds_ratio[5],
                 combined_odds_summary_EUR$odds_ratio[5],
                 combined_odds_summary_OTH$odds_ratio[5]),
  lower_ci = c(combined_odds_summary$lower_ci[5],
               combined_odds_summary_AFR$lower_ci[5],
               combined_odds_summary_HIS$lower_ci[5],
               combined_odds_summary_EUR$lower_ci[5],
               combined_odds_summary_OTH$lower_ci[5]),
  upper_ci = c(combined_odds_summary$upper_ci[5],
               combined_odds_summary_AFR$upper_ci[5],
               combined_odds_summary_HIS$upper_ci[5],
               combined_odds_summary_EUR$upper_ci[5],
               combined_odds_summary_OTH$upper_ci[5])
)

# Log2 transformation of the OR and its CI bounds
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)

# Define the y-limits on the log2 scale
ylim_range <- range(-2, 6.5)

# Plot (title updated to reflect this pre-integration section)
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)", 
     main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups, ancestry-corrected pre-integration", xaxt = "n", col = "black")
axis(1, at = 1:5, labels = quintile_5_data$group)

# Add error bars for 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci, 
       code = 3, angle = 90, length = 0.05, col = "black")

Plot the ROC for GAPS on the data adjusted for ancestry post-integration

# Convert 'Category' into a binary format: 0 for control, 1 for case
# (any label other than "Control" is coded as a case)
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
# ROC of the ancestry-adjusted probabilities on the discovery cohort,
# plotted directly by pROC via plot = TRUE
roc_result_full_adjusted <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$adjusted_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

auc(roc_result_full_adjusted)
## Area under the curve: 0.7589

Plot the ROC for GAPS on the data adjusted for ancestry post-integration on the replication cohort

# Run the regression of the score on PC1 to PC10 in the discovery cohort
# to learn the ancestry component of the score
ancestry_regression_model <- lm(scores ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
                                data = filtered_categorized_combined_data)

# Kept for inspection; predict() below uses the fitted model directly
ancestry_coefficients <- coef(ancestry_regression_model)

# Remove the ancestry component (intercept + sum of PC_i * beta_i) from the
# replication scores. predict() on the lm fit evaluates exactly the same
# linear combination the original spelled out PC-by-PC across ten lines,
# with no chance of a transcription error in the hand-written terms.
categorized_replication_data$adjusted_scores_replication <-
  categorized_replication_data$scores_replication -
  predict(ancestry_regression_model, newdata = categorized_replication_data)


# Map the ancestry-adjusted scores onto (0, 1) with the logistic function
categorized_replication_data <- categorized_replication_data %>%
  mutate(adjusted_probability_replication = plogis(adjusted_scores_replication))


# Compute the ROC curve on the replication cohort
roc_result_full_adjusted <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$adjusted_probability_replication)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_result_full_adjusted)
## Area under the curve: 0.7918
# Plot the ROC curve with the x-axis reversed (specificity running 1 -> 0)
plot(roc_result_full_adjusted,xlim = c(1, 0))

Perform 50 iterations of 5-fold cross-validation on the discovery cohort and plot the distribution of outcomes

# Repeat 5-fold cross-validation 50 times (seeded per repeat) for each
# candidate predictor set, collecting one AUC per model per repeat.

# number of iterations
n_repeats <- 50

# Store results
# NOTE(review): auc_results grows by rbind() inside the loop — fine at this
# scale (450 rows), but preallocating or bind_rows() over a list scales better.
auc_results <- data.frame()

for (i in 1:n_repeats) {
  set.seed(i)  

  # cross-validation method
  cv_folds <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)

  # Train models again
  # NOTE(review): suppressWarnings() hides glm convergence / fitted-probability
  # warnings for every fit; worth capturing once to confirm they are benign.
  cv_model_cardiomyopathy_rare <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null, 
                                        data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_epilepsy_rare <- suppressWarnings(train(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null, 
                                  data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_noncoding_rare <- suppressWarnings(train(Category ~ Epilepsy_noncoding_rare + Cardiomyopathy_noncoding_rare, 
                                   data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_common <- suppressWarnings(train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, 
                           data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_coding_rare <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_null + Epilepsy_null, 
                                data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_epilepsy_and_common <- suppressWarnings(train(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, 
                                        data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_cardiac_and_common <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, 
                                       data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_common_and_coding <- suppressWarnings(train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_null + Epilepsy_null, 
                                      data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

cv_model_full <- suppressWarnings(
  train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + 
          Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + 
          Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + 
          PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + 
          Cardiomyopathy_null + Epilepsy_null, 
                                    data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  # Store results for this iteration
  # NOTE(review): for method = "glm" there is no tuning grid, so $results
  # should have a single row and max() just extracts it — confirm if the
  # method is ever changed to one with tuning parameters.
  temp_results <- data.frame(
    Model = c("Cardiomyopathy Rare", "Epilepsy Rare", "Noncoding Rare", "Common Traits", "Coding Rare", 
              "Epilepsy & Common", "Cardiac & Common", "Common & Coding", "Common, Coding & Noncoding"),
    AUC = c(
      max(cv_model_cardiomyopathy_rare$results$ROC), 
      max(cv_model_epilepsy_rare$results$ROC), 
      max(cv_model_noncoding_rare$results$ROC), 
      max(cv_model_common$results$ROC), 
      max(cv_model_coding_rare$results$ROC), 
      max(cv_model_epilepsy_and_common$results$ROC), 
      max(cv_model_cardiac_and_common$results$ROC), 
      max(cv_model_common_and_coding$results$ROC),
      max(cv_model_full$results$ROC)
    ),
    Iteration = i
  )

  auc_results <- rbind(auc_results, temp_results)
}


# WA color palette for Millenial enjoyment
AUC_palette <- c(wes_palette("Royal1"), wes_palette("Royal2"))

# Boxplot AUC distribution
ggplot(auc_results, aes(x = Model, y = AUC, fill = Model)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = AUC_palette) +  # Apply custom palette
  theme_minimal() +
  labs(title = "AUC Distributions Across Models", x = "Model", y = "AUC") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme_cowplot()

library(dplyr)

# Per-model median and spread (IQR) of the AUC distribution across repeats
summary_stats <- auc_results %>%
  group_by(Model) %>%
  summarise(
    Median_AUC = median(AUC),
    IQR_AUC = IQR(AUC)
  ) %>%
  ungroup()

print(summary_stats)
## # A tibble: 9 × 3
##   Model                      Median_AUC IQR_AUC
##   <chr>                           <dbl>   <dbl>
## 1 Cardiac & Common                0.717 0.00474
## 2 Cardiomyopathy Rare             0.683 0.00300
## 3 Coding Rare                     0.689 0.00460
## 4 Common & Coding                 0.720 0.00697
## 5 Common Traits                   0.663 0.00671
## 6 Common, Coding & Noncoding      0.738 0.00610
## 7 Epilepsy & Common               0.684 0.00727
## 8 Epilepsy Rare                   0.640 0.00332
## 9 Noncoding Rare                  0.581 0.00289

Now down-sample 20 different times and re-run the 5-fold validations so that the ancestries match.

# Nested resampling: 20 outer iterations each draw a fresh ancestry- and
# case/control-balanced downsample, then run 5 stratified inner folds of the
# full model; one AUC per inner fold plus a mean row per outer iteration.

# Initialize a dataframe to store all AUCs
# NOTE(review): grown via rbind() inside the loops — acceptable for
# 20 * (5 + 1) = 120 rows, but preallocation scales better.
auc_results <- data.frame(
  Outer_Fold = integer(),
  Inner_Fold = integer(),
  AUC = numeric(),
  stringsAsFactors = FALSE
)

# Define number of outer folds
outer_folds <- 20
inner_folds <- 5

# Outer loop: 20 iterations
# NOTE(review): no set.seed() here, so each run of the document draws
# different downsamples — confirm whether that is intended.
for (i in 1:outer_folds) {
  
  # Create a new randomly downsampled dataset (ensuring balanced case/control per ancestry)
  # Per-ancestry n values presumably reflect the smaller of the case/control
  # counts available in each cluster — verify against the cohort tables above.
  downsampled_data <- bind_rows(
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "EUR", Category == "zCase") %>% slice_sample(n = 90),
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "EUR", Category == "Control") %>% slice_sample(n = 90),
    
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "HIS", Category == "zCase") %>% slice_sample(n = 58),
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "HIS", Category == "Control") %>% slice_sample(n = 58),
    
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "AFR", Category == "zCase") %>% slice_sample(n = 90),
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "AFR", Category == "Control") %>% slice_sample(n = 90),
    
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "OTH", Category == "zCase") %>% slice_sample(n = 24),
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "OTH", Category == "Control") %>% slice_sample(n = 24)
  )

  # Ensure binary outcome is properly coded
  downsampled_data$binary_outcome <- ifelse(downsampled_data$Category == "zCase", 1, 0)

  # Create 5 stratified folds **by both ancestry and category**
  folds <- downsampled_data %>%
    group_by(cluster_gmm_Ancestry, Category) %>%
    mutate(Fold = sample(rep(1:inner_folds, length.out = n()))) %>%
    ungroup()

  # Train and test across the 5 folds within this downsampled dataset
  inner_auc_values <- numeric(inner_folds)

  for (j in 1:inner_folds) {
    
    # Split into training and testing sets
    train_data <- folds %>% filter(Fold != j) %>% dplyr::select(-Fold)
    test_data  <- folds %>% filter(Fold == j) %>% dplyr::select(-Fold)
    
    # Train model (full predictor set: common, coding rare, and noncoding rare)
    cv_model <- suppressWarnings(
  train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + 
          Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + 
          Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + 
          PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + 
          Cardiomyopathy_null + Epilepsy_null, 
        data = train_data, method = "glm", family = binomial)
)
 
    # Get predicted probabilities for the positive class (zCase)
    test_data$predicted_prob <- predict(cv_model, test_data, type = "prob")[, "zCase"]

    # Compute ROC curve and AUC
    roc_result <- roc(test_data$binary_outcome, test_data$predicted_prob)
    inner_auc_values[j] <- auc(roc_result)
    
    # Store the AUC result
    auc_results <- rbind(auc_results, data.frame(Outer_Fold = i, Inner_Fold = j, AUC = inner_auc_values[j]))
  }
  
  # Store the mean AUC from the 5 inner folds for this outer iteration
  # (flagged with Inner_Fold = NA so it can be separated from fold rows later)
  auc_results <- rbind(auc_results, data.frame(Outer_Fold = i, Inner_Fold = NA, AUC = mean(inner_auc_values)))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Ensure Inner_Fold is treated as character for filtering
# (the mean rows appended with Inner_Fold = NA become NA_character_)
auc_results <- auc_results %>%
  mutate(Inner_Fold = as.character(Inner_Fold))

# Extract the per-fold AUC rows for the histogram.
# BUG FIX (intent): the original `filter(Inner_Fold != "Mean")` excluded the
# mean rows only by accident — `NA != "Mean"` evaluates to NA and filter()
# drops NA conditions. Same rows are kept here, but the NA-drop is explicit.
inner_auc_data <- auc_results %>%
  filter(!is.na(Inner_Fold))

# Compute mean AUC per outer fold
outer_auc_means <- inner_auc_data %>%
  group_by(Outer_Fold) %>%
  summarise(AUC = mean(AUC)) %>%
  ungroup()

# Ensure AUC is numeric (defensive no-op; AUC is already numeric)
inner_auc_data$AUC <- as.numeric(inner_auc_data$AUC)
outer_auc_means$AUC <- as.numeric(outer_auc_means$AUC)

# Print to verify correct means
print(outer_auc_means)
## # A tibble: 20 × 2
##    Outer_Fold   AUC
##         <int> <dbl>
##  1          1 0.668
##  2          2 0.676
##  3          3 0.665
##  4          4 0.676
##  5          5 0.699
##  6          6 0.686
##  7          7 0.678
##  8          8 0.684
##  9          9 0.675
## 10         10 0.682
## 11         11 0.652
## 12         12 0.682
## 13         13 0.673
## 14         14 0.604
## 15         15 0.669
## 16         16 0.672
## 17         17 0.676
## 18         18 0.682
## 19         19 0.681
## 20         20 0.676

Now plot the distribution of all the down-sampled, ancestry-matched AUC results, layering the means of each outer fold on top

# 100%-stacked bar plot: ancestry composition within each category.
# BUG FIX: theme_cowplot(12) is a *complete* theme — applied after
# theme(legend.position = "bottom") it reset the legend back to its default
# position. Complete theme first, element tweaks last.
ancestry_makeup <- ggplot(downsampled_data, aes(x = Category, fill = cluster_gmm_Ancestry)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Makeup of cluster_gmm_Ancestry within Each Category") +
  xlab("Category") +
  ylab("Percentage") +
  scale_fill_manual(values = palette_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom")


ancestry_makeup

# Create histogram with cowplot styling
# All 100 inner-fold AUCs (20 outer x 5 inner); dashed lines mark each outer
# fold's mean and the solid red line marks chance performance (AUC = 0.5).
# NOTE(review): `size` for line widths is deprecated in ggplot2 >= 3.4 in
# favour of `linewidth` — update when the ggplot2 version is bumped.
auc_histogram <- ggplot(inner_auc_data, aes(x = AUC)) +
  geom_histogram(binwidth = 0.01, fill = "lightblue", color = "black") +  
    geom_vline(data = outer_auc_means, aes(xintercept = AUC), color = "black", linetype = "dashed", size = 0.5) + 
  geom_vline(xintercept = 0.5, color = "red", linetype = "solid", size = 1) +
  xlab("AUC") +
  ylab("Frequency") +
  ggtitle("Histogram of Inner Fold AUCs with Outer Fold Means") +
  theme_cowplot(12)

auc_histogram

# Grand mean across the 20 outer-fold mean AUCs (base-R equivalent of the
# summarise()/pull() pipeline)
mean_outer_auc <- mean(outer_auc_means$AUC)

# Report the mean AUC value
print(paste("Mean AUC across outer folds:", mean_outer_auc))
## [1] "Mean AUC across outer folds: 0.672768927539058"
# Load the reporter data (one column per experimental condition)
# NOTE(review): absolute, user-specific path — consider a relative path or
# here::here() so the analysis is portable across machines.
reporter_data <- read.delim("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/Reporter_analysis.txt", check.names = FALSE)

# Reshape wide -> long: one row per (Condition, Activity) measurement
reporter_long <- reporter_data %>%
  pivot_longer(cols = everything(), names_to = "Condition", values_to = "Activity")

library(ggplot2)

# Box-and-jitter plot of reporter activity per condition. Boxplot outliers
# are suppressed because every individual point is drawn by geom_jitter,
# and the y-axis is clipped (not filtered) to 110% of the maximum activity.
ggplot(reporter_long, aes(x = Condition, y = Activity, fill = Condition)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.6, color = "black") +
  labs(title = "Reporter Assay Activity by Condition", x = "", y = "Reporter Activity") +
  coord_cartesian(ylim = c(0, max(reporter_long$Activity, na.rm = TRUE) * 1.1)) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none")